Merge branch 'main' into update_ja-docs

<br>
<p align="left">
<a href="README_CN.md">中文</a>&nbsp &nbspEnglish&nbsp &nbsp<a href="README_JA.md">日本語</a>
</p>
<br><br>
<p align="center">
<img src="assets/logo.jpg" width="400"/>
<br>
<p align="center">
        Qwen-7B <a href="https://modelscope.cn/models/qwen/Qwen-7B/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B">🤗</a>&nbsp; Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B-Chat">🤗</a>&nbsp; | Qwen-7B-Chat-Int4 <a href="https://huggingface.co/Qwen/Qwen-7B-Chat-Int4">🤗</a>
<br>
<a href="https://qianwen-res.oss-cn-beijing.aliyuncs.com/qwen_wechat_group.PNG">WeChat</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://discord.gg/z3GAxXZ9Ce">Discord</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://modelscope.cn/studios/qwen/Qwen-7B-Chat-Demo/summary">Demo</a>&nbsp;&nbsp;<a href="https://github.com/QwenLM/Qwen-7B/blob/main/tech_memo.md">Report</a>
</p>
<br><br>
Qwen-7B is the 7B-parameter version of the large language model series, Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud.

The following sections include information that you might find helpful. Specifically, we advise you to read the FAQ section before you launch issues.

## News and Updates

* 2023.8.21 We release the Int4 quantized model for Qwen-7B-Chat, **Qwen-7B-Chat-Int4**, which requires low memory costs but achieves improved inference speed. Besides, there is no significant performance degradation on the benchmark evaluation.
* 2023.8.3 We release both **Qwen-7B** and **Qwen-7B-Chat** on ModelScope and Hugging Face. We also provide a technical memo for more details about the model, including training details and model performance.

## Performance

In general, Qwen-7B outperforms baseline models of a similar size, and even larger models of around 13B parameters, on a series of benchmark datasets (e.g., MMLU, C-Eval, GSM8K, HumanEval, WMT22, and CMMLU) that evaluate the models' capabilities in natural language understanding, mathematical problem solving, coding, etc. See the results below.
| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU |
| :------------- | :--------: | :--------: | :--------: | :---------: | :-------------: | :--------: |
| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - |
| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - |
| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 |
| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 |
| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - |
| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 |
| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - |
| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - |
| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - |
| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** |
<p align="center">
<img src="assets/performance.png" width="1000"/>
## Quantization

### Usage

**Note: we provide a new solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and release an Int4 quantized model for Qwen-7B-Chat, [Qwen-7B-Chat-Int4](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4), which achieves nearly lossless model quality but lower memory costs and faster inference speed, in comparison with the previous solution.**

Here we demonstrate how to use our provided quantized models for inference. Before you start, make sure you meet the requirements of AutoGPTQ and install it from source (the code supporting Qwen has temporarily not been released in the latest PyPI package):

```bash
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
pip install .
```

Then you can load the quantized model easily as shown below:

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval()
```

To run inference, it is similar to the basic usage demonstrated above, but remember to pass in the generation configuration explicitly:

```python
from transformers import GenerationConfig

config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config)
```
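
For completeness, here is a minimal end-to-end sketch that puts the pieces above together. It assumes the tokenizer is loaded from the same Int4 checkpoint, as in the basic usage:

```python
from transformers import AutoTokenizer, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM

# Assumption: the tokenizer ships with the quantized checkpoint, as with the BF16 models.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
model = AutoGPTQForCausalLM.from_quantized(
    "Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True
).eval()
config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)

# Multi-turn chat: feed the returned history back in on the next turn.
response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config)
print(response)
response, history = model.chat(tokenizer, "Tell me a joke.", history=history, generation_config=config)
print(response)
```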
### Performance

We illustrate the model performance of both BF16 and Int4 models on the benchmark, and we find that the quantized model does not suffer from significant performance degradation. Results are shown below:

| Quantization | MMLU | CEval (val) | GSM8K | HumanEval |
| ------------ | :--: | :---------: | :---: | :-------: |
| BF16         | 53.9 |    54.2     | 41.1  |   24.4    |
| Int4         | 52.6 |    52.9     | 38.1  |   23.8    |

### Inference Speed

We measured the average inference speed (tokens/s) of generating 2048 and 8192 tokens under BF16 precision and Int4 quantization, respectively.

| Quantization | Speed (2048 tokens) | Speed (8192 tokens) |
| ------------ | :-----------------: | :-----------------: |
| BF16         |        30.53        |        28.51        |
| Int4         |        45.60        |        33.83        |

In detail, the profiling setting is generating 8192 new tokens with 1 context token. The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4. The inference speed is averaged over the generated 8192 tokens.
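
The numbers above come from the official profiling script linked below. As a rough illustration only, such tokens-per-second figures can be measured along these lines (a minimal sketch; the helper name is ours, not from the official script):

```python
import time
import torch

def measure_generation_speed(model, tokenizer, n_new_tokens=8192):
    # One context token, matching the profiling setup described above.
    context = torch.tensor([[tokenizer("a")["input_ids"][0]]], device=model.device)
    torch.cuda.synchronize()
    start = time.time()
    out = model.generate(
        context,
        max_new_tokens=n_new_tokens,
        min_new_tokens=n_new_tokens,  # force the full generation length
    )
    torch.cuda.synchronize()
    elapsed = time.time() - start
    return (out.shape[1] - context.shape[1]) / elapsed  # tokens per second
```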
### GPU Memory Usage

We also profile the peak GPU memory usage for encoding 2048 tokens as context (and generating a single token) and for generating 8192 tokens (with a single token as context) under BF16 precision and Int4 quantization, respectively. The results are shown below.

| Quantization | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens |
| ------------ | :---------------------------------: | :-----------------------------------: |
| BF16         |               18.99GB               |                24.40GB                |
| Int4         |               10.20GB               |                15.61GB                |

The above speed and memory profiling are conducted using [this script](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py).
## Demo
### Web UI
We provide code for users to build a web UI demo (thanks to @wysaid). Before you start, make sure you install the following packages:
Streaming with the OpenAI-compatible API looks like this:

```python
for chunk in openai.ChatCompletion.create(
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=True
    # Specifying stop words in streaming output format is not yet supported and is under development.
):
    if hasattr(chunk.choices[0].delta, "content"):
        print(chunk.choices[0].delta.content, end="", flush=True)
```
And non-streaming usage with stop words:

```python
response = openai.ChatCompletion.create(
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=False,
    stop=[]  # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
)
print(response.choices[0].message.content)
```
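
Putting it together, a minimal self-contained client sketch (the `api_base` URL and the model name are assumptions for a locally deployed OpenAI-style server; adjust them to your deployment):

```python
import openai

# Assumptions: an OpenAI-compatible server runs locally and serves the chat
# model under the name "Qwen-7B"; the local server typically ignores the key.
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

response = openai.ChatCompletion.create(
    model="Qwen-7B",
    messages=[{"role": "user", "content": "你好"}],
    stream=False,
    stop=[],
)
print(response.choices[0].message.content)
```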
Qwen-7B-Chat is specifically optimized for tool usage, including APIs, databases, models, etc., so that users can build their own Qwen-7B-based LangChain applications, agents, and code interpreters. In our evaluation [benchmark](eval/EVALUATION.md) for assessing tool-usage capabilities, we find that Qwen-7B achieves stable performance.
| Model | Tool Selection (Acc.↑) | Tool Input (Rouge-L↑) | False Positive Error↓ |
| :------------ | :-----------------------: | :----------------------: | :----------------------: |
| GPT-4 | 95% | **0.90** | 15% |
| GPT-3.5 | 85% | 0.88 | 75% |
| **Qwen-7B** | **99%** | 0.89 | **9.7%** |
For how to write and use prompts for ReAct Prompting, please refer to [the ReAct examples](examples/react_prompt.md). The use of tools can enable the model to better perform tasks.
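
To give a flavor of the format, here is a minimal hand-rolled ReAct-style prompt (illustrative only; the canonical templates live in [the ReAct examples](examples/react_prompt.md), and the tool description here is our own):

```python
TOOL_DESC = "search: useful for looking up facts. Input: a search query."

REACT_PROMPT = f"""Answer the following question. You have access to the following tool:

{TOOL_DESC}

Use this format:

Question: the input question
Thought: reason about what to do next
Action: search
Action Input: the query to run
Observation: the tool result
... (Thought/Action/Action Input/Observation may repeat)
Thought: I now know the final answer
Final Answer: the final answer

Question: Who wrote Hamlet?"""

# Generate until the model emits "Observation:", run the real tool, append
# its output to the prompt, and continue the loop.
response, _ = model.chat(tokenizer, REACT_PROMPT, history=None)
print(response)
```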
Additionally, we provide experimental results to show its capabilities of playing as an agent. See [Hugging Face Agent](https://huggingface.co/docs/transformers/transformers_agents) for more information. Its performance on the run-mode benchmark provided by Hugging Face is as follows:
| Model | Tool Selection↑ | Tool Used↑ | Code↑ |
| :---------------- | :----------------: | :-----------: | :---------: |
| GPT-4 | **100** | **100** | **97.41** |
| GPT-3.5 | 95.37 | 96.30 | 87.04 |
| StarCoder-15.5B | 87.04 | 87.96 | 68.89 |
| **Qwen-7B** | 90.74 | 92.59 | 74.07 |
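
For reference, a minimal sketch of how such an agent is driven (assuming the `HfAgent` class from the `transformers` agents API and the remote StarCoder inference endpoint used in the Hugging Face documentation):

```python
from transformers import HfAgent

# Remote inference endpoint from the Hugging Face transformers_agents docs.
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")

# The agent selects a tool, generates code, and executes it to fulfil the request.
print(agent.run("Translate 'bonjour' into English."))
```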
## Long-Context Understanding

<br>
<p align="left">
        中文&nbsp;&nbsp;<a href="README.md">English</a>&nbsp;&nbsp;<a href="README_JA.md">日本語</a>
</p>
<br><br>
<p align="center">
    <img src="assets/logo.jpg" width="400"/>
<p>
<br>
<p align="center">
        Qwen-7B <a href="https://modelscope.cn/models/qwen/Qwen-7B/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B">🤗</a>&nbsp; Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B-Chat">🤗</a>&nbsp; | Qwen-7B-Chat-Int4 <a href="https://huggingface.co/Qwen/Qwen-7B-Chat-Int4">🤗</a>
<br>
<a href="https://qianwen-res.oss-cn-beijing.aliyuncs.com/qwen_wechat_group.PNG">WeChat</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://discord.gg/z3GAxXZ9Ce">Discord</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://modelscope.cn/studios/qwen/Qwen-7B-Chat-Demo/summary">Demo</a>&nbsp;&nbsp;<a href="https://github.com/QwenLM/Qwen-7B/blob/main/tech_memo.md">Report</a>
</p>
<br><br>

We have open-sourced the **Qwen-7B** series models on both 🤖 **ModelScope** and 🤗 **Hugging Face**; please click the links at the top of this document for the repositories. This document mainly covers an introduction to Qwen-7B, a usage guide, and a technical memo. For more details about the models, please see the [technical memo](tech_memo.md).

Qwen-7B (Tongyi Qianwen 7B) is the 7B-parameter model of Alibaba Cloud's Qwen large language model series. Qwen-7B is a Transformer-based large language model trained on very large-scale pretraining data. The pretraining data is diverse and wide-ranging, covering large amounts of web text, professional books, code, and more. On top of Qwen-7B, we have also built Qwen-7B-Chat, an AI assistant based on the large language model and trained with an alignment mechanism. Features of the Qwen-7B series include:
## News

* 2023.8.21 We released the Int4 quantized model of Qwen-7B-Chat, Qwen-7B-Chat-Int4. It has low memory usage and significantly faster inference than the half-precision model, with only a small performance loss on benchmark evaluations.
* 2023.8.3 We released the Qwen-7B and Qwen-7B-Chat models on ModelScope and Hugging Face, together with a technical memo describing the training details and model performance.

## Performance
## Quantization

### Usage

**Note: we have updated the quantization solution to one based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and provide the Int4 quantized model of Qwen-7B-Chat [here](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4). Compared with the previous solution, it is nearly lossless in model evaluation, with lower storage requirements and better inference speed.**

Below we show how to use the Int4 quantized model. Before you start, make sure you meet the requirements of AutoGPTQ and install it from source (the code supporting Qwen has not yet been released on PyPI):

```bash
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
pip install .
```

Then you can easily load the quantized model:

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval()
```

Inference is similar to the basic usage, but note that the generation config needs to be passed in explicitly:

```python
from transformers import GenerationConfig

config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config)
```

### Performance

We evaluated the BF16 and Int4 models on benchmarks and found that the quantized model suffers only a small performance loss. Results are shown below:

| Quantization | MMLU | CEval (val) | GSM8K | HumanEval |
| ------------ | :--: | :---------: | :---: | :-------: |
| BF16         | 53.9 |    54.2     | 41.1  |   24.4    |
| Int4         | 52.6 |    52.9     | 38.1  |   23.8    |

### Inference Speed

We measured the average inference speed (tokens/s) of the BF16 and Int4 models when generating 2048 and 8192 tokens:

| Quantization | Speed (2048 tokens) | Speed (8192 tokens) |
| ------------ | :-----------------: | :-----------------: |
| BF16         |        30.53        |        28.51        |
| Int4         |        45.60        |        33.83        |

Specifically, we profile generating 8192 tokens from a context of length 1. The evaluation runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4. The inference speed is the average over the 8192 generated tokens.

### GPU Memory Usage

We also measured the peak GPU memory usage of the BF16 and Int4 models for encoding 2048 tokens (and generating a single token) and for generating 8192 tokens (with a single-token context). Results:

| Quantization | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens |
| ------------ | :---------------------------------: | :-----------------------------------: |
| BF16         |               18.99GB               |                24.40GB                |
| Int4         |               10.20GB               |                15.61GB                |

The above speed and memory measurements were produced with [this script](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py).
## Demo
<br>
<p>
### CLI Demo

We provide a simple interactive demo; see `cli_demo.py`. The model supports streaming output: type your message and Qwen-7B-Chat streams back its response. Run the following command:
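
Separately, in your own code, streaming generation looks roughly like this (a minimal sketch assuming the `chat_stream` helper shipped with the model's remote code, which `cli_demo.py` uses; each yielded item is the partial response so far):

```python
# Streaming chat: each iteration yields the response accumulated so far.
for partial in model.chat_stream(tokenizer, "你好", history=None):
    print(partial)
```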
For the OpenAI-style API, streaming usage looks like this:

```python
for chunk in openai.ChatCompletion.create(
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=True
    # Custom stop words are not yet supported in streaming mode; this is under development.
):
    if hasattr(chunk.choices[0].delta, "content"):
        print(chunk.choices[0].delta.content, end="", flush=True)
```
And non-streaming usage:

```python
response = openai.ChatCompletion.create(
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=False,
    stop=[]  # Add custom stop words here; e.g., ReAct prompting needs stop=["Observation:"].
)
print(response.choices[0].message.content)
```

<br>
<p align="left">
        <a href="README_CN.md">中文</a>&nbsp;&nbsp;<a href="README.md">English</a>&nbsp;&nbsp;日本語
</p>
<br><br>
<p align="center">
    <img src="assets/logo.jpg" width="400"/>
<p>
<br>
<p align="center">
        Qwen-7B <a href="https://modelscope.cn/models/qwen/Qwen-7B/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B">🤗</a>&nbsp; Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">🤖</a> | <a href="https://huggingface.co/Qwen/Qwen-7B-Chat">🤗</a>&nbsp; | Qwen-7B-Chat-Int4 <a href="https://huggingface.co/Qwen/Qwen-7B-Chat-Int4">🤗</a>
<br>
<a href="https://qianwen-res.oss-cn-beijing.aliyuncs.com/qwen_wechat_group.PNG">WeChat</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://discord.gg/z3GAxXZ9Ce">Discord</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href="https://modelscope.cn/studios/qwen/Qwen-7B-Chat-Demo/summary">Demo</a>&nbsp;&nbsp;<a href="https://github.com/QwenLM/Qwen-7B/blob/main/tech_memo.md">Report</a>
</p>
<br><br>
<p align="left">
    Japanese document maintainer: <a href="https://github.com/eltociear">Ikko Eltociear Ashimine</a>
</p>
<br>

We open-source **Qwen-7B** and **Qwen-7B-Chat** on both **🤖 ModelScope** and **🤗 Hugging Face** (click the logos above to visit the repositories with code and checkpoints). This repo contains a brief introduction to Qwen-7B, a guide on how to use it, and a technical memo [link](tech_memo.md) providing further details.
## News

* 2023.8.21 We released the Int4 quantized model for Qwen-7B-Chat, **Qwen-7B-Chat-Int4**, which has lower memory cost and improved inference speed, with no significant performance degradation on benchmark evaluations.
* 2023.8.3 We released Qwen-7B and Qwen-7B-Chat on ModelScope and Hugging Face, together with a technical memo covering training details and model performance.

## Performance
## Quantization

### Usage

**Note: we provide a new solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) and have released an Int4 quantized model for Qwen-7B-Chat ([click here](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)), which achieves nearly lossless model quality while improving both memory cost and inference speed compared with the previous solution.**

Here we demonstrate how to use the quantized model for inference. Before you start, make sure you meet the requirements of AutoGPTQ and install it from source (the code supporting Qwen has temporarily not been released in the latest PyPI package):

```bash
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
pip install .
```

Then you can load the quantized model easily as shown below:

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval()
```

Running inference is similar to the basic usage demonstrated above, but remember to pass in the generation configuration explicitly:

```python
from transformers import GenerationConfig

config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config)
```

### Performance

We describe the performance of the BF16 and Int4 models on the benchmark. The results are shown below:

| Quantization | MMLU | CEval (val) | GSM8K | HumanEval |
| ------------ | :--: | :---------: | :---: | :-------: |
| BF16         | 53.9 |    54.2     | 41.1  |   24.4    |
| Int4         | 52.6 |    52.9     | 38.1  |   23.8    |

### Inference Speed

We measured the average inference speed (tokens/s) of generating 2048 and 8192 tokens under BF16 precision and Int4 quantization, respectively.

| Quantization | Speed (2048 tokens) | Speed (8192 tokens) |
| ------------ | :-----------------: | :-----------------: |
| BF16         |        30.53        |        28.51        |
| Int4         |        45.60        |        33.83        |

In detail, the profiling setting is generating 8192 new tokens with 1 context token. The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4. The inference speed is averaged over the 8192 generated tokens.

### GPU Memory Usage

We also profiled the peak GPU memory usage for encoding 2048 tokens as context (and generating a single token) and for generating 8192 tokens (with a single token as context) under BF16 precision and Int4 quantization, respectively. The results are shown below.

| Quantization | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens |
| ------------ | :---------------------------------: | :-----------------------------------: |
| BF16         |               18.99GB               |                24.40GB                |
| Int4         |               10.20GB               |                15.61GB                |

The above speed and memory profiling were done with [this script](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py).
## Demo

```Shell
pip install thefuzz
python evaluate_chat_mmlu.py -d data/mmlu/data/
```
- CMMLU
```Shell
wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip
mkdir data/cmmlu
mv cmmlu_v1_0_1.zip data/cmmlu
cd data/cmmlu; unzip cmmlu_v1_0_1.zip
cd ../../
# Qwen-7B
python evaluate_cmmlu.py -d data/cmmlu/
```
- HumanEval
Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data)

import os
import argparse
import datasets
import torch
from typing import List
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
'''
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
mkdir data/ceval
mv ceval-exam.zip data/ceval
cd data/ceval; unzip ceval-exam.zip
cd ../../
python evaluate_ceval.py -d data/ceval/
'''
def load_models_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path, device_map="auto", trust_remote_code=True
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    return model, tokenizer
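
# A hypothetical quick check of load_models_tokenizer outside the CLI
# (args normally comes from the argparse setup at the bottom of this file):
#
#   import types
#   args = types.SimpleNamespace(checkpoint_path="Qwen/Qwen-7B")
#   model, tokenizer = load_models_tokenizer(args)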
def format_example(line, include_answer=True):
    example = "问题:" + line["question"]
    for choice in choices:
        example += f'\n{choice}. {line[f"{choice}"]}'
    if include_answer:
        example += "\n答案:" + line["answer"] + "\n\n"
    else:
        example += "\n答案:"
    return example
def generate_few_shot_prompt(k, subject, dev_df):
    prompt = ""
    if k == -1:
        k = dev_df.shape[0]
    for i in range(k):
def get_logits(tokenizer, model, inputs: List[str]):
    input_ids = tokenizer(inputs, padding=False)["input_ids"]
    input_ids = torch.tensor(input_ids, device=model.device)
    tokens = {"input_ids": input_ids}

    outputs = model(input_ids)["logits"]
    logits = outputs[:, -1, :]
    log_probs = torch.nn.functional.softmax(logits, dim=-1)
    return log_probs, {"tokens": tokens}
@torch.no_grad()
def eval_subject(
    model,
    tokenizer,
    subject_name,
    test_df,
    k=5,
    dev_df=None,
    few_shot=False,
    save_result_dir=None,
    **kwargs,
):
    result = []
    score = []

    few_shot_prompt = (
        generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else ""
    )
    all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []}
    if args.debug:
        print(f"few_shot_prompt: {few_shot_prompt}")

    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        question = format_example(row, include_answer=False)
        logits = output.flatten()

        softval = torch.nn.functional.softmax(
            torch.tensor(
                [
                    logits[tokenizer("A")["input_ids"]],
                    logits[tokenizer("B")["input_ids"]],
                    logits[tokenizer("C")["input_ids"]],
                    logits[tokenizer("D")["input_ids"]],
                ]
            ),
            dim=0,
        )
        if softval.dtype in {torch.bfloat16, torch.float16}:
            softval = softval.to(dtype=torch.float32)
        probs = softval.detach().cpu().numpy()

        for i, choice in enumerate(choices):
            all_probs[f"prob_{choice}"].append(probs[i])
        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]

        if "answer" in row:
            correct = 1 if pred == row["answer"] else 0
            score.append(correct)
            if args.debug:
                print(f'{question} pred: {pred} ref: {row["answer"]}')
        result.append(pred)

    if score:
        correct_ratio = 100 * sum(score) / len(score)
        if args.debug:
            print(subject_name, correct_ratio)
    else:
        correct_ratio = 0

    if save_result_dir:
        test_df["model_output"] = result
        for i, choice in enumerate(choices):
            test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"]
        if score:
            test_df["correctness"] = score
        os.makedirs(save_result_dir, exist_ok=True)
        test_df.to_csv(
            os.path.join(save_result_dir, f"{subject_name}_result.csv"),
            encoding="utf-8",
            index=False,
        )

    return correct_ratio
def cal_ceval(res):
    acc_sum_dict = dict()
    acc_norm_sum_dict = dict()
    cnt_dict = dict()
    acc_sum = 0.0
    cnt = 0
    hard_cnt = 0
    hard_acc_sum = 0.0
    for tt in res.keys():
        name = tt.split("-")[-1]
        acc_sum += float(res[tt])
        cnt += 1
        class_ = TASK_NAME_MAPPING[name][2]
        if class_ not in acc_sum_dict:
            acc_sum_dict[class_] = 0.0
            acc_norm_sum_dict[class_] = 0.0
            cnt_dict[class_] = 0.0
        if name in hard_list:
            hard_cnt += 1
            hard_acc_sum += float(res[tt])
        acc_sum_dict[class_] += float(res[tt])
        cnt_dict[class_] += 1
    print("\n\n\n")
    for k in ["STEM", "Social Science", "Humanities", "Other"]:
        if k in cnt_dict:
            print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k]))
    if hard_cnt > 0:
        print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt))
    print("AVERAGE acc:%.2f " % (acc_sum / cnt))
TASK_NAME_MAPPING = {
"computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
"operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
"computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
"computer_architecture": [
"Computer Architecture",
"\u8ba1\u7b97\u673a\u7ec4\u6210",
"STEM",
],
"college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
"college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
"college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
"advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
"probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
"discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
"electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
"metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
"high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
"advanced_mathematics": [
"Advanced Mathematics",
"\u9ad8\u7b49\u6570\u5b66",
"STEM",
],
"probability_and_statistics": [
"Probability and Statistics",
"\u6982\u7387\u7edf\u8ba1",
"STEM",
],
"discrete_mathematics": [
"Discrete Mathematics",
"\u79bb\u6563\u6570\u5b66",
"STEM",
],
"electrical_engineer": [
"Electrical Engineer",
"\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08",
"STEM",
],
"metrology_engineer": [
"Metrology Engineer",
"\u6ce8\u518c\u8ba1\u91cf\u5e08",
"STEM",
],
"high_school_mathematics": [
"High School Mathematics",
"\u9ad8\u4e2d\u6570\u5b66",
"STEM",
],
"high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
"high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
"high_school_chemistry": [
"High School Chemistry",
"\u9ad8\u4e2d\u5316\u5b66",
"STEM",
],
"high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
"middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
"middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
"middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
"middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
"middle_school_mathematics": [
"Middle School Mathematics",
"\u521d\u4e2d\u6570\u5b66",
"STEM",
],
"middle_school_biology": [
"Middle School Biology",
"\u521d\u4e2d\u751f\u7269",
"STEM",
],
"middle_school_physics": [
"Middle School Physics",
"\u521d\u4e2d\u7269\u7406",
"STEM",
],
"middle_school_chemistry": [
"Middle School Chemistry",
"\u521d\u4e2d\u5316\u5b66",
"STEM",
],
"veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
"college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
"business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
"marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
"mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
"college_economics": [
"College Economics",
"\u5927\u5b66\u7ecf\u6d4e\u5b66",
"Social Science",
],
"business_administration": [
"Business Administration",
"\u5de5\u5546\u7ba1\u7406",
"Social Science",
],
"marxism": [
"Marxism",
"\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406",
"Social Science",
],
"mao_zedong_thought": [
"Mao Zedong Thought",
"\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba",
"Social Science",
],
"education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
"teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
"high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
"high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
"middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
"middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
"modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
"ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
"teacher_qualification": [
"Teacher Qualification",
"\u6559\u5e08\u8d44\u683c",
"Social Science",
],
"high_school_politics": [
"High School Politics",
"\u9ad8\u4e2d\u653f\u6cbb",
"Social Science",
],
"high_school_geography": [
"High School Geography",
"\u9ad8\u4e2d\u5730\u7406",
"Social Science",
],
"middle_school_politics": [
"Middle School Politics",
"\u521d\u4e2d\u653f\u6cbb",
"Social Science",
],
"middle_school_geography": [
"Middle School Geography",
"\u521d\u4e2d\u5730\u7406",
"Social Science",
],
"modern_chinese_history": [
"Modern Chinese History",
"\u8fd1\u4ee3\u53f2\u7eb2\u8981",
"Humanities",
],
"ideological_and_moral_cultivation": [
"Ideological and Moral Cultivation",
"\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840",
"Humanities",
],
"logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
"law": ["Law", "\u6cd5\u5b66", "Humanities"],
"chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
"chinese_language_and_literature": [
"Chinese Language and Literature",
"\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66",
"Humanities",
],
"art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
"professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
"legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
"high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
"high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
"middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
"professional_tour_guide": [
"Professional Tour Guide",
"\u5bfc\u6e38\u8d44\u683c",
"Humanities",
],
"legal_professional": [
"Legal Professional",
"\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c",
"Humanities",
],
"high_school_chinese": [
"High School Chinese",
"\u9ad8\u4e2d\u8bed\u6587",
"Humanities",
],
"high_school_history": [
"High School History",
"\u9ad8\u4e2d\u5386\u53f2",
"Humanities",
],
"middle_school_history": [
"Middle School History",
"\u521d\u4e2d\u5386\u53f2",
"Humanities",
],
"civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
"sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
"plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
"basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
"clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
"urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
"urban_and_rural_planner": [
"Urban and Rural Planner",
"\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08",
"Other",
],
"accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
"fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
"fire_engineer": [
"Fire Engineer",
"\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08",
"Other",
],
"environmental_impact_assessment_engineer": [
"Environmental Impact Assessment Engineer",
"\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08",
"Other",
],
"tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
"physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"]
"physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
}
hard_list = [
"advanced_mathematics",
"discrete_mathematics",
"probability_and_statistics",
"college_physics",
"college_chemistry",
"high_school_mathematics",
"high_school_physics",
"high_school_chemistry",
]
choices = ["A", "B", "C", "D"]
def main(args):
    model, tokenizer = load_models_tokenizer(args)

    dev_result = {}
    for subject_name in tqdm(TASK_NAME_MAPPING.keys()):
        val_file_path = os.path.join(
            args.eval_data_path, "val", f"{subject_name}_val.csv"
        )
        dev_file_path = os.path.join(
            args.eval_data_path, "dev", f"{subject_name}_dev.csv"
        )
        # test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv')
        val_df = pd.read_csv(val_file_path)
        dev_df = pd.read_csv(dev_file_path)
        # test_df = pd.read_csv(test_file_path)

        score = eval_subject(
            model,
            tokenizer,
            subject_name,
            val_df,
            dev_df=dev_df,
            k=5,
            few_shot=True,
            save_result_dir="outs/ceval_eval_result",  # plain string; no placeholder needed
        )
        dev_result[subject_name] = score
    cal_ceval(dev_result)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Test HF checkpoint.")
    parser.add_argument(
        "-c",
        "--checkpoint-path",
        type=str,
        help="Checkpoint path",
        default="Qwen/Qwen-7B",
    )
    parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")

    # Provide extra arguments required for tasks
    group = parser.add_argument_group(title="Evaluation options")
    group.add_argument(
        "-d", "--eval_data_path", type=str, required=True, help="Path to eval data"
    )
    group.add_argument(
        "--max-seq-len",
        type=int,
        default=2048,
        help="Size of the output generated text.",
    )
    group.add_argument(
        "--debug", action="store_true", default=False, help="Print infos."
    )

    args = parser.parse_args()
    set_seed(args.seed)

    main(args)

import os
import argparse
import datasets
import numpy as np
import re
import torch
import pandas as pd
from thefuzz import process
from typing import List
from tqdm import tqdm
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
'''
wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
mkdir data/ceval
mv ceval-exam.zip data/ceval
cd data/ceval; unzip ceval-exam.zip
cd ../../
python eval/evaluate_chat_ceval.py -d data/ceval
'''
def load_models_tokenizer(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path, device_map="auto", trust_remote_code=True
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        args.checkpoint_path, trust_remote_code=True
    )
    model.generation_config.do_sample = False  # use greedy decoding
    return model, tokenizer
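
# Because do_sample is disabled, model.chat decodes greedily, so repeated runs
# return identical responses and the regex-based answer extraction below stays
# reproducible. A hypothetical check:
#
#   r1, _ = model.chat(tokenizer, question, history=None)
#   r2, _ = model.chat(tokenizer, question, history=None)
#   assert r1 == r2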
def process_before_extraction(gen, question, choice_dict):
    # replace the choice text with its choice letter wherever it appears
    for key, val in choice_dict.items():
        gen = gen.replace(val.rstrip("。"), key)
    return gen
def count_substr(gen, pattern):
    return len(re.findall(pattern, gen))
def extract_choice(gen, prompt, choice_list):
    # patterns like "答案是A" / "选项是A" / "应该选A选项"
    res = re.search(
        r"(?:(?:选|选择|选定)[：:]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?(?:是|选|为|：|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|，|,|．|、|A|B|C|D|$|：|:|\)|）)",
        gen,
    )

    # patterns like "A选项正确" / "A选项符合题意"
    if res is None:
        res = re.search(
            r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的，。：]|符合))[^ABCD]{0,4}?(?:正确|对[的，。：]|符合)",
            gen,
        )

    # a bare "A" at the start of the output
    if res is None:
        res = re.search(r"^[\(（]?(A|B|C|D)(?:。|\)|）|\.|，|,|．|：|:|$)", gen)

    # fall back to the first letter that appears
    if res is None:
        res = re.search(r"(A|B|C|D)", gen)
    if res is None:
        return choices[choice_list.index(process.extractOne(gen, choice_list)[0])]
    else:
        return res.group(1)
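
# Illustrative behavior of the extraction patterns above (hypothetical quick
# checks; expected values reflect the regexes' intent):
#
#   extract_choice("答案是A", "", ["w", "x", "y", "z"])    # -> "A"
#   extract_choice("B选项正确", "", ["w", "x", "y", "z"])   # -> "B"
#   extract_choice("C。理由如下", "", ["w", "x", "y", "z"])  # -> "C"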
def format_example(line):
    example = line["question"] + "\n\n"
    for choice in choices:
        example += f'{choice}. {line[f"{choice}"]}\n'
    return example
def extract_answer(response, row):
    prompt = row["question"]
    gen = process_before_extraction(
        response, prompt, {choice: row[choice] for choice in choices}
    )
    if not isinstance(prompt, str):
        prompt = prompt[0]
    pred = extract_choice(gen, prompt, [row[choice] for choice in choices])
    return pred
@torch.no_grad()
def eval_subject(
    model,
    tokenizer,
    subject_name,
    test_df,
    save_result_dir=None,
    overwrite=False,
    **kwargs
):
    result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv")
    if not overwrite and os.path.exists(result_path):
        print(f"{result_path} existed, skip!")
        score = []
        for (_, datarow), (_, resultrow) in zip(
            test_df.iterrows(), pd.read_csv(result_path).iterrows()
        ):
            pred = extract_answer(resultrow["model_response"], datarow)
            correct = 1 if pred == datarow["answer"] else 0
            score.append(correct)
        correct_ratio = 100 * sum(score) / len(score)
        return correct_ratio
    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        question = format_example(row)

        response, _ = model.chat(
            tokenizer,
            question,
            history=None,
        pred = extract_answer(response, row)
        print(pred)
        print("======================")

        if "answer" in row:
            correct = 1 if pred == row["answer"] else 0
            score.append(correct)
            if args.debug:
                print(f'{question} pred: {pred} ref: {row["answer"]}')
        responses.append(response)
        result.append(pred)

    if score:
        correct_ratio = 100 * sum(score) / len(score)
        if args.debug:
            print(subject_name, correct_ratio)
    else:
        correct_ratio = 0

    if save_result_dir:
        test_df["model_response"] = responses
        test_df["model_output"] = result
        if score:
            test_df["correctness"] = score
        os.makedirs(save_result_dir, exist_ok=True)
def cal_ceval(res):
    acc_sum_dict = dict()
    acc_norm_sum_dict = dict()
    cnt_dict = dict()
    acc_sum = 0.0
    cnt = 0
    hard_cnt = 0
    hard_acc_sum = 0.0
    for tt in res.keys():
        name = tt.split("-")[-1]
        acc_sum += float(res[tt])
        cnt += 1
        class_ = TASK_NAME_MAPPING[name][2]
        if class_ not in acc_sum_dict:
            acc_sum_dict[class_] = 0.0
            acc_norm_sum_dict[class_] = 0.0
            cnt_dict[class_] = 0.0
        if name in hard_list:
            hard_cnt += 1
            hard_acc_sum += float(res[tt])
        acc_sum_dict[class_] += float(res[tt])
        cnt_dict[class_] += 1
    print("\n\n\n")
    for k in ["STEM", "Social Science", "Humanities", "Other"]:
        if k in cnt_dict:
            print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k]))
    if hard_cnt > 0:
        print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt))
    print("AVERAGE acc:%.2f " % (acc_sum / cnt))
TASK_NAME_MAPPING = {
"computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"],
"operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"],
"computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"],
"computer_architecture": [
"Computer Architecture",
"\u8ba1\u7b97\u673a\u7ec4\u6210",
"STEM",
],
"college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"],
"college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"],
"college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"],
"advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"],
"probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"],
"discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"],
"electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"],
"metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"],
"high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"],
"advanced_mathematics": [
"Advanced Mathematics",
"\u9ad8\u7b49\u6570\u5b66",
"STEM",
],
"probability_and_statistics": [
"Probability and Statistics",
"\u6982\u7387\u7edf\u8ba1",
"STEM",
],
"discrete_mathematics": [
"Discrete Mathematics",
"\u79bb\u6563\u6570\u5b66",
"STEM",
],
"electrical_engineer": [
"Electrical Engineer",
"\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08",
"STEM",
],
"metrology_engineer": [
"Metrology Engineer",
"\u6ce8\u518c\u8ba1\u91cf\u5e08",
"STEM",
],
"high_school_mathematics": [
"High School Mathematics",
"\u9ad8\u4e2d\u6570\u5b66",
"STEM",
],
"high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"],
"high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"],
"high_school_chemistry": [
"High School Chemistry",
"\u9ad8\u4e2d\u5316\u5b66",
"STEM",
],
"high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"],
"middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"],
"middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"],
"middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"],
"middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"],
"middle_school_mathematics": [
"Middle School Mathematics",
"\u521d\u4e2d\u6570\u5b66",
"STEM",
],
"middle_school_biology": [
"Middle School Biology",
"\u521d\u4e2d\u751f\u7269",
"STEM",
],
"middle_school_physics": [
"Middle School Physics",
"\u521d\u4e2d\u7269\u7406",
"STEM",
],
"middle_school_chemistry": [
"Middle School Chemistry",
"\u521d\u4e2d\u5316\u5b66",
"STEM",
],
"veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"],
"college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"],
"business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"],
"marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"],
"mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"],
"college_economics": [
"College Economics",
"\u5927\u5b66\u7ecf\u6d4e\u5b66",
"Social Science",
],
"business_administration": [
"Business Administration",
"\u5de5\u5546\u7ba1\u7406",
"Social Science",
],
"marxism": [
"Marxism",
"\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406",
"Social Science",
],
"mao_zedong_thought": [
"Mao Zedong Thought",
"\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba",
"Social Science",
],
"education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"],
"teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"],
"high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"],
"high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"],
"middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"],
"middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"],
"modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"],
"ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"],
"teacher_qualification": [
"Teacher Qualification",
"\u6559\u5e08\u8d44\u683c",
"Social Science",
],
"high_school_politics": [
"High School Politics",
"\u9ad8\u4e2d\u653f\u6cbb",
"Social Science",
],
"high_school_geography": [
"High School Geography",
"\u9ad8\u4e2d\u5730\u7406",
"Social Science",
],
"middle_school_politics": [
"Middle School Politics",
"\u521d\u4e2d\u653f\u6cbb",
"Social Science",
],
"middle_school_geography": [
"Middle School Geography",
"\u521d\u4e2d\u5730\u7406",
"Social Science",
],
"modern_chinese_history": [
"Modern Chinese History",
"\u8fd1\u4ee3\u53f2\u7eb2\u8981",
"Humanities",
],
"ideological_and_moral_cultivation": [
"Ideological and Moral Cultivation",
"\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840",
"Humanities",
],
"logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"],
"law": ["Law", "\u6cd5\u5b66", "Humanities"],
"chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"],
"chinese_language_and_literature": [
"Chinese Language and Literature",
"\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66",
"Humanities",
],
"art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"],
"professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"],
"legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"],
"high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"],
"high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"],
"middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"],
"professional_tour_guide": [
"Professional Tour Guide",
"\u5bfc\u6e38\u8d44\u683c",
"Humanities",
],
"legal_professional": [
"Legal Professional",
"\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c",
"Humanities",
],
"high_school_chinese": [
"High School Chinese",
"\u9ad8\u4e2d\u8bed\u6587",
"Humanities",
],
"high_school_history": [
"High School History",
"\u9ad8\u4e2d\u5386\u53f2",
"Humanities",
],
"middle_school_history": [
"Middle School History",
"\u521d\u4e2d\u5386\u53f2",
"Humanities",
],
"civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"],
"sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"],
"plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"],
"basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"],
"clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"],
"urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"],
"urban_and_rural_planner": [
"Urban and Rural Planner",
"\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08",
"Other",
],
"accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"],
"fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"],
"fire_engineer": [
"Fire Engineer",
"\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08",
"Other",
],
"environmental_impact_assessment_engineer": [
"Environmental Impact Assessment Engineer",
"\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08",
"Other",
],
"tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"],
"physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"]
"physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"],
}
hard_list = [
"advanced_mathematics",
"discrete_mathematics",
"probability_and_statistics",
"college_physics",
"college_chemistry",
"high_school_mathematics",
"high_school_physics",
"high_school_chemistry",
]
choices = ["A", "B", "C", "D"]
@ -257,34 +410,50 @@ def main(args):
print("model loaded")
dev_result = {}
for subject_name in tqdm(TASK_NAME_MAPPING.keys()):
val_file_path = os.path.join(
args.eval_data_path, "val", f"{subject_name}_val.csv"
)
val_df = pd.read_csv(val_file_path)
# dev_df = pd.read_csv(dev_file_path)
# test_df = pd.read_csv(test_file_path)
score = eval_subject(
model,
tokenizer,
subject_name,
val_df,
save_result_dir="outs_chat/ceval_eval_result",
overwrite=args.overwrite,
)
dev_result[subject_name] = score
cal_ceval(dev_result)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
# Provide extra arguments required for tasks
group = parser.add_argument_group(title="Evaluation options")
group.add_argument(
"-d", "--eval_data_path", type=str, required=True, help="Path to eval data"
)
group.add_argument(
"--debug", action="store_true", default=False, help="Print infos."
)
group.add_argument(
"--overwrite",
action="store_true",
default=False,
help="Overwrite existed results",
)
args = parser.parse_args()
set_seed(args.seed)
main(args)

@ -1,15 +1,10 @@
import torch
import jsonlines
import json
import re
from pathlib import Path
import argparse
import numpy as np
import tqdm
from datasets import load_from_disk, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
@ -18,39 +13,41 @@ python eval/evaluate_chat_gsm8k.py [--use-fewshot]
'''
INVALID_ANS = "[invalid]"
DEVICE = "cuda:0"
DEVICE = "cuda:0"
def doc_to_text(doc, use_fewshot):
if use_fewshot:
context = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n" \
"Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n" \
"Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\n" \
"Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n" \
"Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n" \
"When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n" \
"Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n" \
"For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n" \
f"Question: {doc['question']}\nLet's think step by step"
context = (
"Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n"
"Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n"
"Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\n"
"Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n"
"Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n"
"When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n"
"Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n"
"For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n"
f"Question: {doc['question']}\nLet's think step by step"
)
else:
context = doc["question"]
return context
def decode(tokens_list, tokenizer, raw_text_len):
sents = []
# print(len(tokens_list))
for tokens in tokens_list:
tokens = tokens.cpu().numpy().tolist()
sent = tokenizer.tokenizer.decode(tokens[raw_text_len:])
sent = sent.split("<|endoftext|>")[0]
sent = sent.split("\n\n\n")[0]
sent = sent.split("\n\n")[0]
sent = sent.split("Question:")[0]
sents.append(sent)
return sents
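# Editor's note: an illustration (not in the original script) of the truncation
# rules above — decoding is cut at the first stop marker that appears, e.g.:
#   "72 apples\n\nQuestion: ...".split("\n\n")[0]  ->  "72 apples"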
def generate_sample(model, tokenizer, question):
response, _ = model.chat(
tokenizer,
question,
history=None,
@ -64,7 +61,9 @@ def generate_sample(model, tokenizer, question):
def extract_answer_hf(completion):
def _get_last_digit(s):
_PAT_LAST_DIGIT = re.compile(
r"(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))"
)
match = list(_PAT_LAST_DIGIT.finditer(s))
if match:
last_digit = match[-1].group().replace(",", "").replace("+", "")
@ -74,51 +73,66 @@ def extract_answer_hf(completion):
print(f"No digits found in {s!r}")
return last_digit
job_gen = completion.strip(".").replace("\n", "\\n")
last_digit = _get_last_digit(job_gen)
if last_digit is not None:
return eval(last_digit)
return INVALID_ANS
def extract_answer(completion):
try:
last_number = re.findall(r"\d+", completion)[-1]
return eval(last_number)
except:
return INVALID_ANS
def is_correct(completion, answer):
gold = extract_answer(answer)
assert gold != INVALID_ANS, "No ground truth answer found in the document."
return extract_answer(completion) == gold
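# Editor's note: hypothetical spot-checks (not in the original script) of the
# extraction logic above — extract_answer() takes the last integer in the text:
assert extract_answer("So the total is 84+117=201\nThe answer is 201") == 201
assert is_correct("The answer is 4", "They plan to study 4 days.\n#### 4")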
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=Path,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-f", "--sample-input-file", type=str, default=None)
parser.add_argument(
"-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl"
)
parser.add_argument("--use-fewshot", action="store_true")
args = parser.parse_args()
if args.sample_input_file is not None:
dataset = load_from_disk(args.sample_input_file) # or:
else:
dataset = load_dataset("gsm8k", "main")
print("Loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True, bf16=True, use_flash_attn=True
)
print("Loading model ...")
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path, device_map="auto", trust_remote_code=True
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
test = dataset["test"]
f_output = open(args.sample_output_file, "w", encoding="utf-8")
tot_length = test.num_rows
acc_res = []
for doc in tqdm.tqdm(test):
@ -132,6 +146,6 @@ if __name__ == '__main__':
f_output.write(json.dumps(doc, ensure_ascii=False) + "\n")
f_output.flush()
acc_res.append(acc)
f_output.close()
print("4-shot Acc: " if args.use_fewshot else "Zero-shot Acc", np.mean(acc_res))

@ -1,14 +1,10 @@
import torch
import re
import textwrap
import argparse
from pathlib import Path
import tqdm
import jsonlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
@ -24,25 +20,31 @@ evaluate_functional_correctness HumanEval_res.jsonl
DEVICE = "cuda:0"
def extract_code(text, entry_point):
# Regex to match a fenced code block containing the entry point
code_block_pattern = re.compile(
rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL
)
code_block = code_block_pattern.search(text)
if code_block is None:
code_block_pattern = re.compile(
rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL
)
code_block = code_block_pattern.search(text)
if code_block is None:
code_block_pattern = re.compile(
r"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL
)
code_block = code_block_pattern.search(text)
if code_block is not None:
return code_block.group(1)
else:
# if no code block is found, assume the LM is simply filling the code
return textwrap.indent(text, " " * 4)
def generate_sample(model, tokenizer, question, entry_point):
response, _ = model.chat(
tokenizer,
question,
history=None,
@ -52,31 +54,56 @@ def generate_sample(model, tokenizer, question, entry_point):
answer = extract_code(response, entry_point)
return answer, response
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=Path,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument(
"-f",
"--sample-input-file",
type=str,
default=None,
help="data path to HumanEval.jsonl",
)
parser.add_argument(
"-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl"
)
args = parser.parse_args()
print("Loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
print("Loading model ...")
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
bf16=True,
use_flash_attn=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8"))
f = jsonlines.open(args.sample_input_file)
with f_output as output:
for jobj in tqdm.tqdm(f, desc="task_idx"):
prompt = "Help me fill the following code.\n" + jobj["prompt"]
task_id = jobj["task_id"]
answer, response = generate_sample(
model, tokenizer, prompt, jobj["entry_point"]
)
gen_jobjs = {"task_id": task_id, "completion": answer, "response": response}
output.write(gen_jobjs)
f_output.close()

@ -1,14 +1,13 @@
import os
import numpy as np
import argparse
import datasets
import re
from typing import List
import torch
import pandas as pd
from tqdm import tqdm
from thefuzz import process
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
'''
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
@ -22,18 +21,29 @@ python eval/evaluate_chat_mmlu.py -d data/mmlu/data/
'''
def load_models_tokenizer(args):
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
bf16=True,
use_flash_attn=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
return model, tokenizer
def format_example(line):
example = (
"The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n"
+ line["question"]
+ "\n"
)
for choice in choices:
example += f'{choice}. {line[f"{choice}"]}\n'
return example
@ -47,13 +57,20 @@ def process_before_extraction(gen, choice_dict):
gen = pattern.sub(key, gen)
return gen
def extract_choice(gen, choice_list):
# answer is A | choice is A | choose A
res = re.search(r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", gen)
res = re.search(
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
gen,
)
# A is correct | A is right
if res is None:
res = re.search(r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", gen)
res = re.search(
r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
gen,
)
# straight answer: A
if res is None:
@ -65,32 +82,37 @@ def extract_choice(gen, choice_list):
if res is None:
return choices[choice_list.index(process.extractOne(gen, choice_list)[0])]
return res.group(1)
def extract_answer(response, row):
gen = process_before_extraction(
response, {choice: row[choice] for choice in choices}
)
pred = extract_choice(gen, [row[choice] for choice in choices])
return pred
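# Editor's note: hypothetical spot-checks (not in the original script) of the
# choice-extraction heuristics above:
#   extract_choice("The answer is B", opts)  ->  "B"   (first pattern)
#   extract_choice("B is correct", opts)     ->  "B"   (second pattern)
# When no explicit letter is found, the fuzzy fallback via
# thefuzz.process.extractOne picks the closest option text instead.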
@torch.no_grad()
def eval_subject(
model,
tokenizer,
subject_name,
test_df,
save_result_dir=None,
overwrite=False,
**kwargs
):
result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv")
if not overwrite and os.path.exists(result_path):
print(f"{result_path} existed, skip!")
score = []
for (_, datarow), (_, resultrow) in zip(
test_df.iterrows(), pd.read_csv(result_path).iterrows()
):
# pred = extract_answer(resultrow['model_response'], datarow)
pred = resultrow["model_output"]
correct = 1 if pred == datarow["answer"] else 0
score.append(correct)
return score
@ -100,7 +122,7 @@ def eval_subject(
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = format_example(row)
response, _ = model.chat(
tokenizer,
question,
history=None,
@ -111,20 +133,24 @@ def eval_subject(
print(pred)
print("======================")
if "answer" in row:
correct = 1 if pred == row["answer"] else 0
score.append(correct)
if args.debug:
print(f'{question} pred: {pred} ref: {row["answer"]}')
result.append(pred)
if save_result_dir:
test_df["model_output"] = result
test_df["model_response"] = response
if score:
test_df["correctness"] = score
os.makedirs(save_result_dir, exist_ok=True)
test_df.to_csv(
os.path.join(save_result_dir, f"{subject_name}_result.csv"),
encoding="utf-8",
index=False,
)
return score
@ -133,15 +159,13 @@ def cal_mmlu(res):
acc_sum_dict = dict()
acc_norm_sum_dict = dict()
cnt_dict = dict()
acc_sum = 0.0
cnt = 0
hard_cnt = 0
hard_acc_sum = 0.
for class_ in TASK_NAME_MAPPING.keys():
acc_sum_dict[class_] = 0.0
acc_norm_sum_dict[class_] = 0.0
cnt_dict[class_] = 0.0
for tt in TASK_NAME_MAPPING[class_]:
acc_sum += sum(res[tt])
@ -150,13 +174,12 @@ def cal_mmlu(res):
acc_sum_dict[class_] += sum(res[tt])
cnt_dict[class_] += len(res[tt])
print("\n\n\n")
for k in TASK_NAME_MAPPING.keys():
if k in cnt_dict:
print("%s ACC: %.2f " % (k, acc_sum_dict[k] * 100 / cnt_dict[k]))
print("AVERAGE ACC:%.2f " % (acc_sum * 100 / cnt))
def main(args):
print("loading model weights")
@ -170,38 +193,122 @@ def main(args):
for subject_name in tqdm(SUBJECTS):
# val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv')
# dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv')
test_file_path = os.path.join(
args.eval_data_path, "test", f"{subject_name}_test.csv"
)
# val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer'])
# dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer'])
test_df = pd.read_csv(
test_file_path, names=["question", "A", "B", "C", "D", "answer"]
)
score = eval_subject(
model,
tokenizer,
subject_name,
test_df,
save_result_dir=f"outs_chat/mmlu_eval_result",
overwrite=args.overwrite,
)
dev_result[subject_name] = score
cal_mmlu(dev_result)
TASK_NAME_MAPPING = {
"stem": [
"abstract_algebra",
"anatomy",
"astronomy",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"computer_security",
"conceptual_physics",
"electrical_engineering",
"elementary_mathematics",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_mathematics",
"high_school_physics",
"high_school_statistics",
"machine_learning",
],
"Humanities": [
"formal_logic",
"high_school_european_history",
"high_school_us_history",
"high_school_world_history",
"international_law",
"jurisprudence",
"logical_fallacies",
"moral_disputes",
"moral_scenarios",
"philosophy",
"prehistory",
"professional_law",
"world_religions",
],
"other": [
"business_ethics",
"college_medicine",
"human_aging",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"nutrition",
"professional_accounting",
"professional_medicine",
"virology",
"global_facts",
"clinical_knowledge",
],
"social": [
"econometrics",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_microeconomics",
"high_school_psychology",
"human_sexuality",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
],
}
SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl]
choices = ["A", "B", "C", "D"]
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
# Provide extra arguments required for tasks
group = parser.add_argument_group(title="Evaluation options")
group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data")
group.add_argument(
"--debug", action="store_true", default=False, help="Print infos."
)
group.add_argument(
"--overwrite",
action="store_true",
default=False,
help="Overwrite existed results",
)
args = parser.parse_args()
set_seed(args.seed)
main(args)

@ -11,39 +11,46 @@ from tqdm import tqdm
from transformers.trainer_utils import set_seed
"""
wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip
mkdir data/cmmlu
mv cmmlu_v1_0_1.zip data/cmmlu
cd data/cmmlu; unzip cmmlu_v1_0_1.zip
cd ../../
python evaluate_cmmlu.py -d data/cmmlu/
"""
def load_models_tokenizer(args):
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path, device_map="auto", trust_remote_code=True
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
return model, tokenizer
def format_example(line, include_answer=True):
example = "问题:" + line["Question"]
for choice in choices:
example += f'\n{choice}. {line[f"{choice}"]}'
if include_answer:
example += "\n答案:" + line["Answer"] + "\n\n"
else:
example += "\n答案:"
return example
def generate_few_shot_prompt(k, subject, dev_df):
prompt = ""
if k == -1:
k = dev_df.shape[0]
for i in range(k):
@ -55,35 +62,37 @@ def generate_few_shot_prompt(k, subject, dev_df):
def get_logits(tokenizer, model, inputs: List[str]):
input_ids = tokenizer(inputs, padding=False)["input_ids"]
input_ids = torch.tensor(input_ids, device=model.device)
tokens = {"input_ids": input_ids}
outputs = model(input_ids)["logits"]
logits = outputs[:, -1, :]
log_probs = torch.nn.functional.softmax(logits, dim=-1)
return log_probs, {"tokens": tokens}
@torch.no_grad()
def eval_subject(
model,
tokenizer,
subject_name,
test_df,
k=5,
dev_df=None,
few_shot=False,
save_result_dir=None,
**kwargs,
):
result = []
score = []
few_shot_prompt = (
generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else []
)
all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []}
if args.debug:
print(f"few_shot_prompt: {few_shot_prompt}")
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = format_example(row, include_answer=False)
@ -94,51 +103,56 @@ def eval_subject(
logits = output.flatten()
softval = torch.nn.functional.softmax(
torch.tensor(
[
logits[tokenizer("A")["input_ids"]],
logits[tokenizer("B")["input_ids"]],
logits[tokenizer("C")["input_ids"]],
logits[tokenizer("D")["input_ids"]],
]
),
dim=0,
)
if softval.dtype in {torch.bfloat16, torch.float16}:
softval = softval.to(dtype=torch.float32)
probs = softval.detach().cpu().numpy()
for i, choice in enumerate(choices):
all_probs[f"prob_{choice}"].append(probs[i])
pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
if "Answer" in row:
correct = 1 if pred == row["Answer"] else 0
score.append(correct)
if args.debug:
print(f'{question} pred: {pred} ref: {row["Answer"]}')
result.append(pred)
if score:
correct_ratio = 100 * sum(score) / len(score)
if args.debug:
print(subject_name, correct_ratio)
else:
correct_ratio = 0
if save_result_dir:
test_df["model_output"] = result
for i, choice in enumerate(choices):
test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"]
if score:
test_df["correctness"] = score
os.makedirs(save_result_dir, exist_ok=True)
test_df.to_csv(
os.path.join(save_result_dir, f"{subject_name}_result.csv"),
encoding="utf-8",
index=False,
)
return correct_ratio
def cal_cmmlu(res):
print("\n\n\n")
res = {k.split("-")[-1]: float(v) for k, v in res.items()}
for k, v in TASK_NAME_MAPPING.items():
avg_acc = np.mean(list(map(lambda x: res[x], v)))
print(f"{k} acc: {avg_acc:.2f}")
@ -147,85 +161,103 @@ def cal_cmmlu(res):
subcategories = {
"agronomy": ['other'],
"anatomy": ['biology'],
"ancient_chinese": ['linguistics','china specific'],
"arts": ['arts'],
"astronomy": ['physics'],
"business_ethics": ['business'],
"chinese_civil_service_exam": ['politics','china specific'],
"chinese_driving_rule": ['other','china specific'],
"chinese_food_culture": ['culture','china specific'],
"chinese_foreign_policy": ['politics','china specific'],
"chinese_history":['history','china specific'],
"chinese_literature": ['literature','china specific'],
"chinese_teacher_qualification": ['education','china specific'],
"college_actuarial_science":['math'],
"college_education":['education'],
"college_engineering_hydrology": ['engineering'],
"college_law": ['law'],
"college_mathematics": ['math'],
"college_medical_statistics":['statistics'],
"clinical_knowledge": ['other'],
"college_medicine": ['other'],
"computer_science": ['computer science'],
"computer_security": ['other'],
"conceptual_physics": ['physics'],
"construction_project_management": ['other','china specific'],
"economics": ['economics'],
"education": ['education'],
"elementary_chinese":['linguistics','china specific'],
"elementary_commonsense":['other','china specific'],
"elementary_information_and_technology": ['other'],
"electrical_engineering": ['engineering'],
"elementary_mathematics": ['math'],
"ethnology": ['culture','china specific'],
"food_science": ['other'],
"genetics": ['biology'],
"global_facts": ['global'],
"high_school_biology": ['biology'],
"high_school_chemistry": ['chemistry'],
"high_school_geography": ['geography'],
"high_school_mathematics": ['math'],
"high_school_physics": ['physics'],
"high_school_politics": ['politics','china specific'],
"human_sexuality": ['other'],
"international_law": ['law'],
"journalism": ['sociology'],
"jurisprudence": ['law'],
"legal_and_moral_basis": ['other'],
"logical": ['philosophy'],
"machine_learning": ['computer science'],
"management": ['business'],
"marketing": ['business'],
"marxist_theory": ['philosophy'],
"modern_chinese": ['linguistics','china specific'],
"nutrition": ['other'],
"philosophy": ['philosophy'],
"professional_accounting": ['business'],
"professional_law": ['law'],
"professional_medicine": ['other'],
"professional_psychology": ['psychology'],
"public_relations": ['politics'],
"security_study": ['politics'],
"sociology": ['culture'],
"sports_science": ['other'],
"traditional_chinese_medicine": ['other','china specific'],
"virology": ['biology'],
"world_history":['history'],
"world_religions": ['global'],
"agronomy": ["other"],
"anatomy": ["biology"],
"ancient_chinese": ["linguistics", "china specific"],
"arts": ["arts"],
"astronomy": ["physics"],
"business_ethics": ["business"],
"chinese_civil_service_exam": ["politics", "china specific"],
"chinese_driving_rule": ["other", "china specific"],
"chinese_food_culture": ["culture", "china specific"],
"chinese_foreign_policy": ["politics", "china specific"],
"chinese_history": ["history", "china specific"],
"chinese_literature": ["literature", "china specific"],
"chinese_teacher_qualification": ["education", "china specific"],
"college_actuarial_science": ["math"],
"college_education": ["education"],
"college_engineering_hydrology": ["engineering"],
"college_law": ["law"],
"college_mathematics": ["math"],
"college_medical_statistics": ["statistics"],
"clinical_knowledge": ["other"],
"college_medicine": ["other"],
"computer_science": ["computer science"],
"computer_security": ["other"],
"conceptual_physics": ["physics"],
"construction_project_management": ["other", "china specific"],
"economics": ["economics"],
"education": ["education"],
"elementary_chinese": ["linguistics", "china specific"],
"elementary_commonsense": ["other", "china specific"],
"elementary_information_and_technology": ["other"],
"electrical_engineering": ["engineering"],
"elementary_mathematics": ["math"],
"ethnology": ["culture", "china specific"],
"food_science": ["other"],
"genetics": ["biology"],
"global_facts": ["global"],
"high_school_biology": ["biology"],
"high_school_chemistry": ["chemistry"],
"high_school_geography": ["geography"],
"high_school_mathematics": ["math"],
"high_school_physics": ["physics"],
"high_school_politics": ["politics", "china specific"],
"human_sexuality": ["other"],
"international_law": ["law"],
"journalism": ["sociology"],
"jurisprudence": ["law"],
"legal_and_moral_basis": ["other"],
"logical": ["philosophy"],
"machine_learning": ["computer science"],
"management": ["business"],
"marketing": ["business"],
"marxist_theory": ["philosophy"],
"modern_chinese": ["linguistics", "china specific"],
"nutrition": ["other"],
"philosophy": ["philosophy"],
"professional_accounting": ["business"],
"professional_law": ["law"],
"professional_medicine": ["other"],
"professional_psychology": ["psychology"],
"public_relations": ["politics"],
"security_study": ["politics"],
"sociology": ["culture"],
"sports_science": ["other"],
"traditional_chinese_medicine": ["other", "china specific"],
"virology": ["biology"],
"world_history": ["history"],
"world_religions": ["global"],
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
"STEM": [
"physics",
"chemistry",
"biology",
"computer science",
"math",
"engineering",
"statistics",
],
"Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
"Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
"Other":["other"],
"Social Science": [
"linguistics",
"business",
"politics",
"culture",
"economics",
"geography",
"psychology",
"education",
"sociology",
],
"Other": ["other"],
"China specific": ["china specific"],
}
TASK_NAME_MAPPING = defaultdict(list)
for k, v in categories.items():
for subject, subcat in subcategories.items():
for c in subcat:
if c in v:
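# Editor's note: the elided loop body above presumably appends `subject` to
# TASK_NAME_MAPPING[k]. A hypothetical spot-check of the resulting grouping:
#   "astronomy"       (subcategory "physics")  -> TASK_NAME_MAPPING["STEM"]
#   "chinese_history" ("history", "china specific")
#       -> TASK_NAME_MAPPING["Humanities"] and TASK_NAME_MAPPING["China specific"]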
@ -240,30 +272,52 @@ def main(args):
test_result = {}
for subject_name in tqdm(subcategories.keys()):
dev_file_path = os.path.join(args.eval_data_path, "dev", f"{subject_name}.csv")
test_file_path = os.path.join(
args.eval_data_path, "test", f"{subject_name}.csv"
)
dev_df = pd.read_csv(dev_file_path)
test_df = pd.read_csv(test_file_path)
score = eval_subject(
model,
tokenizer,
subject_name,
dev_df=dev_df,
test_df=test_df,
k=5,
few_shot=True,
save_result_dir=f"outs/cmmlu_eval_result",
)
test_result[subject_name] = score
cal_cmmlu(test_result)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title="Evaluation options")
group.add_argument(
"-d", "--eval_data_path", type=str, required=True, help="Path to eval data"
)
group.add_argument(
"--max-seq-len",
type=int,
default=2048,
help="Size of the output generated text.",
)
group.add_argument(
"--debug", action="store_true", default=False, help="Print infos."
)
args = parser.parse_args()
set_seed(args.seed)

@ -1,15 +1,10 @@
import re
import torch
import argparse
import jsonlines
import numpy as np
import datasets
from datasets import load_from_disk, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
@ -17,31 +12,37 @@ from transformers.generation import GenerationConfig
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"
def doc_to_text(doc):
return fewshot_prompt + "\nQuestion: " + doc["question"] + "\nLet's think step by step\n"
return (
fewshot_prompt
+ "\nQuestion: "
+ doc["question"]
+ "\nLet's think step by step\n"
)
def decode(tokens_list, tokenizer, raw_text_len):
sents = []
# print(len(tokens_list))
for tokens in tokens_list:
tokens = tokens.cpu().numpy().tolist()
sent = tokenizer.tokenizer.decode(tokens[raw_text_len:])
sent = sent.split("<|endoftext|>")[0]
sent = sent.split("\n\n\n")[0]
sent = sent.split("\n\n")[0]
sent = sent.split("Question:")[0]
sents.append(sent)
return sents
def generate_sample(model, tokenizer, input_txt):
input_ids = tokenizer.tokenizer.encode(input_txt)
raw_text_len = len(input_ids)
context_enc = torch.tensor([input_ids]).to(model.device)
print(f"Input text: {input_txt}\n")
outputs = model.generate(context_enc)
output_text = decode(outputs, tokenizer, raw_text_len)[0]
print(f"\nOutput text: {output_text}\n")
return output_text
@ -55,24 +56,34 @@ def extract_answer_hf(completion):
else:
return INVALID_ANS
def extract_answer(completion):
try:
last_number = re.findall(r"\d+", completion)[-1]
return eval(last_number)
except:
return INVALID_ANS
def is_correct(completion, answer):
gold = extract_answer_hf(answer)
assert gold != INVALID_ANS, "No ground truth answer found in the document."
return extract_answer(completion) == gold
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B",
)
parser.add_argument("-f", "--sample-input-file", type=str, default=None)
parser.add_argument(
"-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl"
)
args = parser.parse_args()
@ -80,31 +91,37 @@ if __name__ == '__main__':
if args.sample_input_file is not None:
dataset = load_from_disk(args.sample_input_file)
else:
config = datasets.DownloadConfig(resume_download=True, max_retries=100)
dataset = load_dataset("gsm8k", "main", download_config=config)
test = dataset["test"]
print("Loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
print("Loading model ...")
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path, device_map="auto", trust_remote_code=True
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False
f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8"))
tot_length = test.num_rows
acc_res = []
for doc in test:
context = doc_to_text(doc)
completion = generate_sample(model, tokenizer, context)
answer= doc["answer"]
answer = doc["answer"]
acc = is_correct(completion, answer)
doc["completion"]=completion
doc["acc"]=acc
doc["completion"] = completion
doc["acc"] = acc
f_output.write(doc)
acc_res.append(acc)
f_output.close()
print("Acc: ",np.mean(acc_res))
print("Acc: ", np.mean(acc_res))

@ -1,11 +1,7 @@
import argparse
import tqdm
import torch
import jsonlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
@ -15,56 +11,75 @@ $ pip install -e human-eval
evaluate_functional_correctness sample-output-file
"""
def decode(tokens_list, tokenizer, raw_text_len):
sents = []
# print(len(tokens_list))
for tokens in tokens_list:
tokens = tokens.cpu().numpy().tolist()
sent = tokenizer.tokenizer.decode(tokens[raw_text_len:])
sent = sent.split("<|endoftext|>")[0]
sent = sent.split("\n\n\n")[0]
sent = sent.split("\n\n")[0]
sent = sent.split("def ")[0]
sents.append(sent)
return sents
def generate_sample(model, tokenizer, input_txt):
input_ids = tokenizer.tokenizer.encode(input_txt)
raw_text_len = len(input_ids)
context_enc = torch.tensor([input_ids]).to(model.device)
print(f"Input text: {input_txt}\n")
outputs = model.generate(context_enc)
output_text = decode(outputs, tokenizer, raw_text_len)[0]
print(f"\nOutput text: \n{output_text}\n")
return output_text
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B",
)
parser.add_argument(
"-f",
"--sample-input-file",
type=str,
default=None,
help="data path to HumanEval.jsonl",
)
parser.add_argument(
"-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl"
)
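    # Each line later written to sample-output-file is a JSON object of the form
    # {"task_id": "HumanEval/0", "completion": "<generated code>"}, which is the
    # input format expected by evaluate_functional_correctness.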
args = parser.parse_args()
print("Loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
print("Loading model ...")
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path, device_map="auto", trust_remote_code=True
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False
f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8"))
f = jsonlines.open(args.sample_input_file)
with f_output as output:
for jobj in tqdm.tqdm(f, desc="task_idx"):
prompt = jobj["prompt"]
task_id = jobj["task_id"]
gen_sents = generate_sample(model, tokenizer, prompt)
gen_jobjs = {"task_id": task_id, "completion": gen_sents}
output.write(gen_jobjs)
f_output.close()

@ -1,57 +1,60 @@
import os
import pandas as pd
import numpy as np
import argparse
import datasets
import torch
from typing import List
from tqdm import tqdm
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
"""
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir data/mmlu
mv data.tar data/mmlu
cd data/mmlu; tar xf data.tar
cd ../../
python eval/evaluate_mmlu.py -d data/mmlu/data/
"""
def load_models_tokenizer(args):
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path, device_map="auto", trust_remote_code=True
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
return model, tokenizer
def format_example(line, include_answer=True):
example = "Question: " + line["question"]
for choice in choices:
example += f'\n{choice}. {line[f"{choice}"]}'
if include_answer:
example += "\nAnswer: " + line["answer"] + "\n\n"
else:
example += "\nAnswer:"
return example
def generate_few_shot_prompt(k, subject, dev_df):
def format_subject(subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s.strip()
prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
format_subject(subject)
)
if k == -1:
k = dev_df.shape[0]
@ -64,81 +67,87 @@ def generate_few_shot_prompt(k, subject, dev_df):
def get_logits(tokenizer, model, inputs: List[str]):
input_ids = tokenizer(inputs, padding=False)["input_ids"]
input_ids = torch.tensor(input_ids, device=model.device)
if input_ids.shape[1] > args.max_seq_len:
input_ids = input_ids[:, input_ids.shape[1] - args.max_seq_len + 1 :]
tokens = {"input_ids": input_ids}
outputs = model(input_ids)["logits"]
logits = outputs[:, -1, :]
log_probs = torch.nn.functional.softmax(logits, dim=-1)
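    # NOTE: despite the variable name, softmax returns probabilities here, not log-probabilities.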
return log_probs, {"tokens": tokens}
@torch.no_grad()
def eval_subject(
model,
tokenizer,
subject_name,
test_df,
k=5,
dev_df=None,
few_shot=False,
save_result_dir=None,
**kwargs,
):
result = []
score = []
few_shot_prompt = (
generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else []
)
all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []}
if args.debug:
print(f"few_shot_prompt: {few_shot_prompt}")
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
question = format_example(row, include_answer=False)
full_prompt = few_shot_prompt + question
output, input_info = get_logits(tokenizer, model, [full_prompt])
assert output.shape[0] == 1
logits = output.flatten()
softval = torch.nn.functional.softmax(
torch.tensor(
[
logits[tokenizer(" A")["input_ids"]],
logits[tokenizer(" B")["input_ids"]],
logits[tokenizer(" C")["input_ids"]],
logits[tokenizer(" D")["input_ids"]],
]
),
dim=0,
)
if softval.dtype in {torch.bfloat16, torch.float16}:
softval = softval.to(dtype=torch.float32)
probs = softval.detach().cpu().numpy()
for i, choice in enumerate(choices):
all_probs[f"prob_{choice}"].append(probs[i])
pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
if "answer" in row:
correct = 1 if pred == row["answer"] else 0
score.append(correct)
if args.debug:
print(f'{question} pred: {pred} ref: {row["answer"]}')
result.append(pred)
if save_result_dir:
test_df["model_output"] = result
for i, choice in enumerate(choices):
test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"]
if score:
test_df["correctness"] = score
os.makedirs(save_result_dir, exist_ok=True)
test_df.to_csv(
os.path.join(save_result_dir, f"{subject_name}_result.csv"),
encoding="utf-8",
index=False,
)
return score
@ -147,15 +156,15 @@ def cal_mmlu(res):
acc_sum_dict = dict()
acc_norm_sum_dict = dict()
cnt_dict = dict()
acc_sum = 0.0
cnt = 0
hard_cnt = 0
hard_acc_sum = 0.0
for class_ in TASK_NAME_MAPPING.keys():
acc_sum_dict[class_] = 0.0
acc_norm_sum_dict[class_] = 0.0
cnt_dict[class_] = 0.0
for tt in TASK_NAME_MAPPING[class_]:
acc_sum += sum(res[tt])
@ -164,13 +173,12 @@ def cal_mmlu(res):
acc_sum_dict[class_] += sum(res[tt])
cnt_dict[class_] += len(res[tt])
print("\n\n\n", "total cnt:", cnt, "\n")
for k in TASK_NAME_MAPPING.keys():
if k in cnt_dict:
print("%s ACC: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k] * 100))
print("AVERAGE ACC:%.2f " % (acc_sum / cnt * 100))
def main(args):
model, tokenizer = load_models_tokenizer(args)
@ -178,41 +186,130 @@ def main(args):
dev_result = {}
for subject_name in tqdm(SUBJECTS):
# val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv')
dev_file_path = os.path.join(
args.eval_data_path, "dev", f"{subject_name}_dev.csv"
)
test_file_path = os.path.join(
args.eval_data_path, "test", f"{subject_name}_test.csv"
)
# val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer'])
dev_df = pd.read_csv(
dev_file_path, names=["question", "A", "B", "C", "D", "answer"]
)
test_df = pd.read_csv(
test_file_path, names=["question", "A", "B", "C", "D", "answer"]
)
score = eval_subject(
model,
tokenizer,
subject_name,
test_df,
dev_df=dev_df,
k=5,
few_shot=True,
        save_result_dir="outs/mmlu_eval_result",
)
dev_result[subject_name] = score
cal_mmlu(dev_result)
TASK_NAME_MAPPING = {
"stem": [
"abstract_algebra",
"anatomy",
"astronomy",
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"computer_security",
"conceptual_physics",
"electrical_engineering",
"elementary_mathematics",
"high_school_biology",
"high_school_chemistry",
"high_school_computer_science",
"high_school_mathematics",
"high_school_physics",
"high_school_statistics",
"machine_learning",
],
"Humanities": [
"formal_logic",
"high_school_european_history",
"high_school_us_history",
"high_school_world_history",
"international_law",
"jurisprudence",
"logical_fallacies",
"moral_disputes",
"moral_scenarios",
"philosophy",
"prehistory",
"professional_law",
"world_religions",
],
"other": [
"business_ethics",
"college_medicine",
"human_aging",
"management",
"marketing",
"medical_genetics",
"miscellaneous",
"nutrition",
"professional_accounting",
"professional_medicine",
"virology",
"global_facts",
"clinical_knowledge",
],
"social": [
"econometrics",
"high_school_geography",
"high_school_government_and_politics",
"high_school_macroeconomics",
"high_school_microeconomics",
"high_school_psychology",
"human_sexuality",
"professional_psychology",
"public_relations",
"security_studies",
"sociology",
"us_foreign_policy",
],
}
SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl]
choices = ["A", "B", "C", "D"]
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
parser.add_argument("--gpu", type=int, default=0, help="gpu id")
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title="Evaluation options")
group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data")
group.add_argument(
"--max-seq-len",
type=int,
default=2048,
help="Size of the output generated text.",
)
group.add_argument(
"--debug", action="store_true", default=False, help="Print infos."
)
args = parser.parse_args()
set_seed(args.seed)
main(args)

@ -12,47 +12,48 @@ from transformers.generation import GenerationConfig
from transformers.tools.evaluate_agent import evaluate_agent
from transformers.trainer_utils import set_seed
data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
def is_callable(response, golden):
return response["action"].strip().lower() == golden["action"].strip().lower()
def process_res(response):
# parse response
response += "\n" # fix not-find bug
thought = response[: response.find("Action:")].strip()
action = response[
response.find("Action:") + len("Action:") : response.find("Action Input:")
].strip()
action_input = response[
response.find("Action Input:")
+ len("Action Input:") : response.find("Observation:")
].strip()
# TODO: This parsing result is incorrect if the response contains multiple Actions. To be fixed in the future.
observation = response[
response.find("Observation:") + len("Observation:") : response.rfind("Thought:")
].strip()
thought_last = response[
response.rfind("Thought:") + len("Thought:") : response.find("Final Answer:")
].strip()
final_answer = response[
response.find("Final Answer:") + len("Final Answer:") :
].strip()
try:
action_input = json.dumps(
json5.loads(action_input), ensure_ascii=False, sort_keys=True
)
    except Exception:
# print("JSON Load Error:", action_input)
pass
res_dict = {
"thought": thought,
"action": action,
"action_input": action_input,
"observation": observation,
"thought_last": thought_last,
"final_answer": final_answer,
}
return res_dict
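# A hypothetical worked example: for a ReAct-style response such as
#   'Thought: I need a tool.\nAction: image_gen\nAction Input: {"prompt": "river"}\n'
#   'Observation: <image>\nThought: done.\nFinal Answer: here it is.'
# process_res returns {"thought": "Thought: I need a tool.", "action": "image_gen",
# "action_input": '{"prompt": "river"}', "observation": "<image>",
# "thought_last": "done.", "final_answer": "here it is."}.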
@ -68,20 +69,18 @@ def _get_tokenized_string(tokenizer, text_list):
assert tokenizer is not None
token_ids = tokenizer.encode(text)
tokens_bytes = tokenizer.convert_ids_to_tokens(token_ids)
tokens = [token.decode("utf-8", errors="replace") for token in tokens_bytes]
tokenized_string = " ".join(tokens)
token_ids_list.append(token_ids)
tokenized_string_list.append(tokenized_string)
return token_ids_list, tokenized_string_list
def eval_action(job):
response = job["gen"][0]
golden = job["response"]
if "Action:" in response:
response, golden = process_res(response), process_res(golden)
if is_callable(response, golden):
return True
@ -89,26 +88,29 @@ def eval_action(job):
def eval_action_input(job, tokenizer):
response = job["gen"][0]
golden = job["response"]
response, golden = process_res(response), process_res(golden)
query = job["prompt"]
job = {}
job["prompt"] = query
job["gen"] = response["action_input"]
job["response"] = golden["action_input"]
job["_gen_tok"], job["_gen_tok_str"] = _get_tokenized_string(
tokenizer, [response["action_input"]]
)
job["_reference_tok"], job["_reference_tok_str"] = _get_tokenized_string(
tokenizer, [golden["action_input"]]
)
scorer = rouge_scorer.RougeScorer(
["rouge1", "rouge2", "rougeL"], tokenizer=_DummyTokenizer()
)
score = scorer.score(job["_reference_tok_str"][0], job["_gen_tok_str"][0])
rouge = score["rougeL"].fmeasure
return rouge
@ -124,24 +126,33 @@ class QWenAgent(Agent):
agent.run("Draw me a picture of rivers and lakes.")
```
"""
def __init__(
self,
chat_prompt_template=None,
run_prompt_template=None,
additional_tools=None,
tokenizer=None,
model=None,
):
if tokenizer and model:
self.tokenizer = tokenizer
self.model = model
else:
checkpoint = "Qwen/Qwen-7B-Chat"
self.tokenizer = AutoTokenizer.from_pretrained(
checkpoint, trust_remote_code=True
)
self.model = (
AutoModelForCausalLM.from_pretrained(
checkpoint, device_map="auto", trust_remote_code=True
)
.cuda()
.eval()
)
self.model.generation_config = GenerationConfig.from_pretrained(
checkpoint, trust_remote_code=True
        )  # Different generation lengths, top_p, and other related hyperparameters can be specified here.
self.model.generation_config.do_sample = False # greedy
super().__init__(
@ -152,155 +163,161 @@ class QWenAgent(Agent):
def generate_one(self, prompt, stop):
        # "Human:" and "Assistant:" used to be special reserved tokens for Qwen, so they need to be replaced with "_HUMAN_:" and "_ASSISTANT_:". This issue will be fixed in a future release.
prompt = prompt.replace("Human:", "_HUMAN_:").replace(
"Assistant:", "_ASSISTANT_:"
)
stop = [
item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:")
for item in stop
]
result, _ = self.model.chat(self.tokenizer, prompt, history=None)
for stop_seq in stop:
if result.endswith(stop_seq):
result = result[: -len(stop_seq)]
result = result.replace("_HUMAN_:", "Human:").replace(
"_ASSISTANT_:", "Assistant:"
)
return result
def load_models_tokenizer(args):
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
bf16=True,
use_flash_attn=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True
)
model.generation_config.do_sample = False # use greedy decoding
return model, tokenizer
def load_jobs(filename):
jobs = []
with jsonlines.open(os.path.join(data_root_path, filename), mode="r") as reader:
for job in reader:
jobs.append(job)
return jobs
def react_inference(filename, model, tokenizer):
filename_cache = filename + ".cache"
if os.path.exists(os.path.join(data_root_path, filename_cache)):
jobs = load_jobs(filename=filename_cache)
print("Loaded from", filename_cache)
else:
with open(os.path.join(data_root_path, filename_cache), "w") as f:
jobs = load_jobs(filename=filename)
print("Inference:", filename)
for job in tqdm(jobs):
response, history = model.chat(tokenizer, job["prompt"], history=None)
job["gen"] = [response]
f.writelines(json.dumps(job, ensure_ascii=False) + "\n")
print(filename_cache, "is saved.")
return jobs
def main(args):
print("loading model weights")
if args.checkpoint_path is not None:
model, tokenizer = load_models_tokenizer(args)
else:
model, tokenizer = None, None
print("model loaded")
result = {}
# eval react positive
if args.eval_react_positive:
print("eval react positive ...")
acc_count = 0
rouge_mean = 0
jobs = react_inference(
filename=args.eval_react_positive_filename, model=model, tokenizer=tokenizer
)
for job in jobs:
if eval_action(job):
acc_count += 1
rouge = eval_action_input(job, tokenizer)
rouge_mean += rouge / len(jobs)
scores = {
"action_right_rate": acc_count / len(jobs),
"action_input_rouge": rouge_mean,
}
result.update({"react_positive": scores})
# eval react negative
if args.eval_react_negative:
print("eval react negative ...")
bad_count = 0
jobs = react_inference(
filename=args.eval_react_negative_filename, model=model, tokenizer=tokenizer
)
for job in jobs:
if "\nAction:" in job["gen"][0]:
bad_count += 1
scores = {"bad_rate": bad_count / len(jobs)}
result.update({"react_negative": scores})
# eval hfagent
if args.eval_hfagent:
print("eval hfagent ...")
agent = QWenAgent(model=model, tokenizer=tokenizer)
scores = evaluate_agent(agent, verbose=False, return_errors=False)
result.update({"hfagent": scores})
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(result)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test HF checkpoint.")
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
help="Checkpoint path",
default="Qwen/Qwen-7B-Chat",
)
parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed")
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title="Evaluation options")
group.add_argument(
"--eval-react-positive",
action="store_true",
default=False,
help="Eval react positive.",
)
group.add_argument(
"--eval-react-positive-filename",
type=str,
default="exam_plugin_v1_react_positive.jsonl",
help="Eval react positive filename.",
)
group.add_argument(
"--eval-react-negative",
action="store_true",
default=False,
help="Eval react negative.",
)
group.add_argument(
"--eval-react-negative-filename",
type=str,
default="exam_plugin_v1_react_negative.jsonl",
help="Eval react negative filename.",
)
group.add_argument(
"--eval-hfagent", action="store_true", default=False, help="Eval hfagent."
)
args = parser.parse_args()
set_seed(args.seed)

@ -0,0 +1,59 @@
# Auto Comments
This document introduces Auto Comments, a use case that leverages the Qwen model to automatically generate comments for code files.
# Usage
You can run the following command directly to generate comments for the provided code file:
```
python auto_comments.py --path 'path of file or folder'
```
Arguments:
- path: the path to process. It can be a single file (currently only Python code files are supported) or a folder (all Python code files under the folder will be scanned).
- regenerate: regenerate the comments. Defaults to False; if you need to regenerate the comments for the same file, set it to True.
# Example
- Run: python auto_comments.py --path test_file.py
- The content of test_file.py is:
```
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style="whitegrid")
rs = np.random.RandomState(365)
values = rs.randn(365, 4).cumsum(axis=0)
dates = pd.date_range("1 1 2016", periods=365, freq="D")
data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"])
data = data.rolling(7).mean()
sns.lineplot(data=data, palette="tab10", linewidth=2.5)
```
- Output: test_file_comments.py (the code file containing the comments), whose content is as follows:
```
# Import the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
# Set the Seaborn theme style to white grid
sns.set_theme(style="whitegrid")
# Create a random number generator
rs = np.random.RandomState(365)
# Generate 365 rows and 4 columns of random numbers and take the cumulative sum along the rows
values = rs.randn(365, 4).cumsum(axis=0)
# Generate the dates
dates = pd.date_range("1 1 2016", periods=365, freq="D")
# Combine the random numbers and dates into a DataFrame
data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"])
# Apply a 7-day rolling mean to the DataFrame
data = data.rolling(7).mean()
# Plot a line chart with Seaborn
sns.lineplot(data=data, palette="tab10", linewidth=2.5)
```

@ -0,0 +1,189 @@
# Usage: python auto_comments.py --path 'path of file or folder'
# This script uses Qwen-7B-Chat to automatically generate comments for the provided code files (see auto_comments.md for details).
import argparse
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
MaxLine = 50  # Maximum number of code lines to process in a single pass
SplitKey = ["\ndef "]  # Custom markers used to split the code
CodeFileType = ["py"]  # So far, generating comments has only been tested on Python files
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--path', type=str, default='Qwen-7B/eval/evaluate_ceval.py')
    parser.add_argument('--regenerate', action='store_true', default=False)  # By default, comments are not regenerated if they already exist
args = parser.parse_args()
return args
class QWenChat():
def __init__(self):
self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
# use bf16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval()
# use auto mode, automatically select precision based on the device.
self.model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval()
# Specify hyperparameters for generation
self.model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
self.history = None
def chat(self, query, system = ""):
# use history
# response, history = self.model.chat(self.tokenizer, query, history=self.history)
        # history is not used by default
response, history = self.model.chat(self.tokenizer, query, history=None)
self.history = history
return response
# Generate comments
def gen_code_comments(context, model = None, **kwargs):
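    # The prompt below is deliberately kept in Chinese, since the generated comments
    # are meant to be in Chinese. It roughly translates to: "Generate detailed Chinese
    # comments for the code above, using proper syntax. Every function must begin with
    # a unified comment describing what it does. Apart from the comments, keep the
    # original code unchanged; do not return anything except the comments and the code,
    # and do not generate extra code."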
prompt = "\n为以上代码生成细致的中文注释,注意使用合适的语法。要求必须在每个函数开头生成一段统一的函数功能注释。\n除了注释,请保证原始代码内容不变。不要返回除了注释和代码以外的其余信息,不要生成额外代码。\n"
return model.chat(context + prompt)
def read_file(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return "".join(lines)
def write_file(path, context):
    with open(path, "w", encoding="utf-8") as f:
        f.write(context)
# If the code file is too long, it can simply be split by a maximum number of lines
def split_context_by_maxline(text):
    lines = text.split("\n")
    lines_len = len(lines)
    res = []
    i = 0  # guard against an undefined loop variable when the range below is empty
    for i in range(MaxLine, lines_len, MaxLine):
        res.append("\n".join(lines[i - MaxLine : i]))
    if i < lines_len:
        res.append("\n".join(lines[i:]))
    return res
# If the code file is too long, it can simply be split at function boundaries
def split_context_by_splitkey(text):
blocks = text.split(SplitKey[0])
return [blocks[0]] + [SplitKey[0]+x for x in blocks[1:]]
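# Hypothetical example: for the text "import os\ndef a():\n    pass\ndef b():\n    pass",
# split_context_by_splitkey returns
# ["import os", "\ndef a():\n    pass", "\ndef b():\n    pass"].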
# Merge the original code with the generated comments, so that the original code is guaranteed to remain unchanged. Many different strategies could be used for this step.
def merge_code_and_comments(original_file, comments_path):
    res = []
    with open(original_file, "r", encoding="utf-8") as ori_f:
        ori_lines = ori_f.readlines()
    with open(comments_path, "r", encoding="utf-8") as com_f:
        com_lines = com_f.readlines()
len_com_lines = len(com_lines)
p = 0
j = 0
for i, line in enumerate(ori_lines):
if line.isspace():
continue
if line.strip()[0] == '#':
res.append(line)
continue
while j < len_com_lines and line[:-1] not in com_lines[j]:
j += 1
if j < len_com_lines:
p = j - 1
up_comments = []
triple_dot_flag = 0
while p < j:
if p < 0 or (res and res[-1] and com_lines[p] == res[-1]):
break
if com_lines[p].strip() and (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == '"""' and com_lines[p].strip()[:3] == '"""') or (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == "'''" and com_lines[p].strip()[:3] == "'''"):
up_comments.append(com_lines[p])
p -= 1
continue
if com_lines[p].strip() and (com_lines[p].strip()[-3:] == '"""' or com_lines[p].strip()[:3] == '"""' or com_lines[p].strip()[-3:] == "'''" or com_lines[p].strip()[:3] == "'''"):
triple_dot_flag = (triple_dot_flag + 1)%2
up_comments.append(com_lines[p])
p -= 1
continue
if triple_dot_flag:
up_comments.append(com_lines[p])
p -= 1
continue
if (com_lines[p].strip()=="") or (com_lines[p].strip() and com_lines[p].strip()[0] == '#' and "省略部分内容" not in com_lines[p]):
up_comments.append(com_lines[p])
else:
break
p -= 1
if up_comments:
res.extend(reversed(up_comments))
if "#" in com_lines[j] and "#" not in line:
in_line_comments = " #" + com_lines[j].split("#")[-1]
res.append(line[:-1]+in_line_comments)
else:
res.append(line)
p = j+1
else:
res.append(line)
j = p
write_file(comments_path, "".join(res))
# Process a single file
def deal_one_file(model, path, args):
context = read_file(path)
fname = path.split("/")[-1]
fpath = "/".join(path.split("/")[:-1])
outfname = fname.split(".")[0]+"_comments."+fname.split(".")[-1]
comments_path = os.path.join(fpath, outfname)
if (not args.regenerate) and os.path.exists(comments_path):
print("use cache: ", comments_path)
return
context_line = len(context.split("\n"))
if context_line < MaxLine:
res = gen_code_comments(context, model = model)
elif SplitKey[0] not in context:
context_list = split_context_by_maxline(context)
res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list])
else:
context_list = split_context_by_splitkey(context)
res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list])
write_file(comments_path, res)
merge_code_and_comments(path, comments_path)
# Process a folder
def deal_folder(model, path, args):
for fl in os.listdir(path):
now_path = os.path.join(path, fl)
if os.path.isfile(now_path):
if (now_path.split(".")[-1] in CodeFileType) and ("_comments" not in now_path):
deal_one_file(model, now_path, args)
elif os.path.isdir(now_path):
deal_folder(model, now_path, args)
else:
print("Please specify a correct path!")
def transfer(args):
model = QWenChat()
if os.path.isfile(args.path):
if (args.path.split(".")[-1] in CodeFileType) and ("_comments" not in args.path):
deal_one_file(model, args.path, args)
elif os.path.isdir(args.path):
deal_folder(model, args.path, args)
else:
print("Please specify a correct path!")
if __name__ == '__main__':
args = parse_args()
print(args)
transfer(args)

@ -68,6 +68,7 @@ class ChatCompletionRequest(BaseModel):
top_p: Optional[float] = None
max_length: Optional[int] = None
stream: Optional[bool] = False
stop: Optional[List[str]] = []
class ChatCompletionResponseChoice(BaseModel):
@ -103,7 +104,8 @@ async def create_chat_completion(request: ChatCompletionRequest):
if request.messages[-1].role != "user":
raise HTTPException(status_code=400, detail="Invalid request")
query = request.messages[-1].content
stop_words = request.stop
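    # For every stop word that starts with "\n", also register the bare variant
    # without the leading newline, so it still matches when the model omits the newline.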
stop_words.extend(list(map(lambda x: x[1:], filter(lambda x: x.startswith("\n"), stop_words))))
prev_messages = request.messages[:-1]
# Temporarily, the system role does not work as expected. We advise that you write the setups for role-play in your query.
# if len(prev_messages) > 0 and prev_messages[0].role == "system":
@ -120,10 +122,18 @@ async def create_chat_completion(request: ChatCompletionRequest):
raise HTTPException(status_code=400, detail="Invalid request.")
if request.stream:
generate = predict(query, history, request.model, stop_words)
return EventSourceResponse(generate, media_type="text/event-stream")
if stop_words:
react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
response, _ = model.chat(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens)
for stop_ in stop_words:
if response.endswith(stop_):
response = response[:response.find(stop_)]
else:
response, _ = model.chat(tokenizer, query, history=history)
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
@ -133,9 +143,9 @@ async def create_chat_completion(request: ChatCompletionRequest):
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")
async def predict(query: str, history: List[List[str]], model_id: str, stop_words: List[str]):
global model, tokenizer
assert stop_words == [], "in stream format, stop word is output"
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
@ -145,8 +155,13 @@ async def predict(query: str, history: List[List[str]], model_id: str):
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
current_length = 0
if stop_words:
react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
response_generator = model.chat_stream(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens)
else:
response_generator = model.chat_stream(tokenizer, query, history=history)
for new_response in response_generator:
if len(new_response) == current_length:
continue
@ -187,7 +202,7 @@ def _get_args():
if __name__ == "__main__":
args = _get_args()
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True, resume_download=True,
)
@ -203,7 +218,7 @@ if __name__ == "__main__":
trust_remote_code=True,
resume_download=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True, resume_download=True,
)

@ -0,0 +1,44 @@
import torch
from transformers import AutoModelForCausalLM
from accelerate import dispatch_model
def _device_map(num_gpus, num_layers):
per_gpu_layers = (num_layers + 2) / num_gpus
device_map = {
'transformer.wte': 0,
'transformer.ln_f': 0,
'lm_head': num_gpus-1
}
used = 1
gpu_target = 0
for i in range(num_layers):
if used >= per_gpu_layers:
gpu_target += 1
used = 0 if gpu_target < num_gpus-1 else 1
assert gpu_target < num_gpus
device_map[f'transformer.h.{i}'] = gpu_target
used += 1
return device_map
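# Note: the "+2" in per_gpu_layers budgets for the embedding (wte) and the final
# norm (ln_f), which are pinned to GPU 0 above, while lm_head sits on the last
# GPU; the transformer blocks are then spread as evenly as possible across GPUs.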
def load_model_on_gpus(model_name_or_path, num_gpus: int = 2):
num_devices = torch.cuda.device_count()
if num_gpus == 1:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto',
trust_remote_code=True).eval()
elif 1 < num_gpus <= num_devices:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu',
trust_remote_code=True).eval()
num_layers = model.config.num_hidden_layers
device_map = _device_map(num_gpus, num_layers)
print(device_map)
model = dispatch_model(model, device_map=device_map)
    else:
        raise KeyError(f"num_gpus must be between 1 and {num_devices}, got {num_gpus}")
return model
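# A minimal usage sketch (assumptions: at least two visible GPUs and a tokenizer
# loaded separately with AutoTokenizer):
#
#     model = load_model_on_gpus("Qwen/Qwen-7B-Chat", num_gpus=2)
#     response, _ = model.chat(tokenizer, "Hello", history=None)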

@ -4,15 +4,18 @@
# LICENSE file in the root directory of this source tree.
"""A simple web interactive chat demo based on gradio."""
import os
from argparse import ArgumentParser
import gradio as gr
import mdtex2html
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat'
def _get_args():
@ -44,17 +47,29 @@ def _load_model_tokenizer(args):
else:
device_map = "auto"
qconfig_path = os.path.join(args.checkpoint_path, 'quantize_config.json')
if os.path.exists(qconfig_path):
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
args.checkpoint_path,
device_map=device_map,
trust_remote_code=True,
resume_download=True,
use_safetensors=True,
).eval()
else:
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map=device_map,
trust_remote_code=True,
resume_download=True,
).eval()
config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True, resume_download=True,
)
return model, tokenizer, config
def postprocess(self, y):
@ -103,14 +118,14 @@ def _parse_text(text):
return text
def _launch_demo(args, model, tokenizer, config):
def predict(_query, _chatbot, _task_history):
print(f"User: {_parse_text(_query)}")
_chatbot.append((_parse_text(_query), ""))
full_response = ""
for response in model.chat_stream(tokenizer, _query, history=_task_history, generation_config=config):
_chatbot[-1] = (_parse_text(_query), _parse_text(response))
yield _chatbot
@ -131,9 +146,13 @@ def _launch_demo(args, model, tokenizer):
def reset_user_input():
return gr.update(value="")
def reset_state(_chatbot, _task_history):
_task_history.clear()
_chatbot.clear()
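        # Release the GPU memory cached for the previous conversation before starting a new one.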
import gc
gc.collect()
torch.cuda.empty_cache()
return _chatbot
with gr.Blocks() as demo:
gr.Markdown("""\
@ -162,7 +181,7 @@ Qwen-7B-Chat <a href="https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary">
submit_btn.click(predict, [query, chatbot, task_history], [chatbot], show_progress=True)
submit_btn.click(reset_user_input, [], [query])
empty_btn.click(reset_state, [chatbot, task_history], outputs=[chatbot], show_progress=True)
regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
gr.Markdown("""\
@ -183,9 +202,9 @@ including hate speech, violence, pornography, deception, etc. \
def main():
args = _get_args()
model, tokenizer, config = _load_model_tokenizer(args)
_launch_demo(args, model, tokenizer, config)
if __name__ == '__main__':
