diff --git a/README.md b/README.md
index 1205e3b..63cc127 100644
--- a/README.md
+++ b/README.md
@@ -318,7 +318,8 @@ for chunk in openai.ChatCompletion.create(
     messages=[
         {"role": "user", "content": "你好"}
     ],
-    stream=True
+    stream=True
+    # Specifying stop words in streaming output format is not yet supported and is under development.
 ):
     if hasattr(chunk.choices[0].delta, "content"):
         print(chunk.choices[0].delta.content, end="", flush=True)
@@ -329,7 +330,8 @@ response = openai.ChatCompletion.create(
     messages=[
         {"role": "user", "content": "你好"}
     ],
-    stream=False
+    stream=False,
+    stop=[] # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
 )
 print(response.choices[0].message.content)
 ```
diff --git a/README_CN.md b/README_CN.md
index 811d5e2..4764fe4 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -323,6 +323,7 @@ for chunk in openai.ChatCompletion.create(
         {"role": "user", "content": "你好"}
     ],
     stream=True
+    # 流式输出的自定义stopwords功能尚未支持,正在开发中
 ):
     if hasattr(chunk.choices[0].delta, "content"):
         print(chunk.choices[0].delta.content, end="", flush=True)
@@ -333,7 +334,8 @@ response = openai.ChatCompletion.create(
     messages=[
         {"role": "user", "content": "你好"}
     ],
-    stream=False
+    stream=False,
+    stop=[] # 在此处添加自定义的stop words,例如ReAct prompting时需要增加:stop=["Observation:"]
 )
 print(response.choices[0].message.content)
 ```
diff --git a/openai_api.py b/openai_api.py
index da105f3..52da00b 100644
--- a/openai_api.py
+++ b/openai_api.py
@@ -68,6 +68,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     max_length: Optional[int] = None
     stream: Optional[bool] = False
+    stop: Optional[List[str]] = []
 
 
 class ChatCompletionResponseChoice(BaseModel):
@@ -103,7 +104,8 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.messages[-1].role != "user":
         raise HTTPException(status_code=400, detail="Invalid request")
     query = request.messages[-1].content
-
+    stop_words = request.stop
+    stop_words.extend(list(map(lambda x: x[1:], filter(lambda x: x.startswith("\n"), stop_words))))
     prev_messages = request.messages[:-1]
     # Temporarily, the system role does not work as expected. We advise that you write the setups for role-play in your query.
     # if len(prev_messages) > 0 and prev_messages[0].role == "system":
@@ -120,10 +122,18 @@
         raise HTTPException(status_code=400, detail="Invalid request.")
 
     if request.stream:
-        generate = predict(query, history, request.model)
+        generate = predict(query, history, request.model, stop_words)
         return EventSourceResponse(generate, media_type="text/event-stream")
 
-    response, _ = model.chat(tokenizer, query, history=history)
+    if stop_words:
+        react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
+        response, _ = model.chat(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens)
+        for stop_ in stop_words:
+            if response.endswith(stop_):
+                response = response[:response.find(stop_)]
+    else:
+        response, _ = model.chat(tokenizer, query, history=history)
+
     choice_data = ChatCompletionResponseChoice(
         index=0,
         message=ChatMessage(role="assistant", content=response),
@@ -133,9 +143,9 @@
     return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")
 
 
-async def predict(query: str, history: List[List[str]], model_id: str):
+async def predict(query: str, history: List[List[str]], model_id: str, stop_words: List[str]):
     global model, tokenizer
-
+    assert stop_words == [], "custom stop words are not yet supported in streaming mode"
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
         delta=DeltaMessage(role="assistant"),
@@ -145,8 +155,13 @@ async def predict(query: str, history: List[List[str]], model_id: str):
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
 
     current_length = 0
+    if stop_words:
+        react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
+        response_generator = model.chat_stream(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens)
+    else:
+        response_generator = model.chat_stream(tokenizer, query, history=history)
 
-    for new_response in model.chat_stream(tokenizer, query, history):
+    for new_response in response_generator:
         if len(new_response) == current_length:
             continue
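A few notes on the change, with sketches rather than definitive code. First, client-side usage of the new `stop` field in a non-streaming request, mirroring the README snippet in this patch; the `api_base`, `api_key`, and model name below are assumptions for a local deployment of `openai_api.py`:

```python
import openai

# Assumed local deployment of openai_api.py; adjust host/port to your setup.
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

# Non-streaming request with a custom stop word, as used in ReAct prompting:
# generation halts once "Observation:" is produced, and the server strips the
# stop word from the end of the returned message.
response = openai.ChatCompletion.create(
    model="Qwen",
    messages=[{"role": "user", "content": "你好"}],
    stream=False,
    stop=["Observation:"]
)
print(response.choices[0].message.content)
```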
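Second, the one-line `map`/`filter` expression added to `create_chat_completion` is dense; this standalone sketch (sample values hypothetical) spells out what it computes:

```python
# Equivalent, more explicit form of the stop-word augmentation: for every stop
# word that begins with a newline, also register the variant without it, so a
# match succeeds whether or not the model emits the newline first.
stop_words = ["\nObservation:", "Final Answer:"]  # sample values
extra = [w[1:] for w in stop_words if w.startswith("\n")]
stop_words.extend(extra)
print(stop_words)  # ['\nObservation:', 'Final Answer:', 'Observation:']
```

Materializing the extra list before extending avoids mutating `stop_words` while it is being iterated.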
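Third, the post-generation cleanup in the non-streaming branch: `model.chat` with `stop_words_ids` halts only after the stop word itself has been emitted, so the handler trims it from the tail of the response. A toy illustration (the sample response is hypothetical):

```python
# model.chat(..., stop_words_ids=...) stops generation only after the stop
# word itself has been produced, so it is trimmed from the response tail.
response = "Thought: I should call the tool.\nObservation:"  # sample output
for stop_ in ["Observation:"]:
    if response.endswith(stop_):
        response = response[:response.find(stop_)]
print(repr(response))  # 'Thought: I should call the tool.\n'
```

Note that `response.find` cuts at the first occurrence of the stop word, which is the desired cut point for ReAct. The matching `stop_words` branch in `predict` is currently unreachable behind the assert, presumably scaffolding for the streaming support the README flags as under development.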