From b7767361cf892c2af5f666409d01eb89a34e0852 Mon Sep 17 00:00:00 2001 From: "tujianhong.tjh" Date: Mon, 21 Aug 2023 13:33:15 +0800 Subject: [PATCH 01/27] add example: auto_comments --- examples/auto_comments.md | 59 ++++++++++++ examples/auto_comments.py | 189 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 examples/auto_comments.md create mode 100644 examples/auto_comments.py diff --git a/examples/auto_comments.md b/examples/auto_comments.md new file mode 100644 index 0000000..4aadd90 --- /dev/null +++ b/examples/auto_comments.md @@ -0,0 +1,59 @@ +# Auto Comments +本文档介绍Auto Comments,这是一个利用Qwen模型为代码文件自动生成注释的使用案例。 + +# 使用方法 +您可以直接执行如下命令,为提供的代码文件生成注释: +``` +python auto_comments.py --path 'path of file or folder' +``` + +参数: +- path:文件路径。可以是文件(目前支持python代码文件),也可以是文件夹(会扫描文件夹下所有python代码文件) +- regenerate:重新生成。默认False,如果针对同一文件需要重新生成注释,请设置为True + +# 使用样例 +- 执行:python auto_comments.py --path test_file.py +- test_file.py 内容为: +``` +import numpy as np +import pandas as pd +import seaborn as sns +sns.set_theme(style="whitegrid") + +rs = np.random.RandomState(365) +values = rs.randn(365, 4).cumsum(axis=0) +dates = pd.date_range("1 1 2016", periods=365, freq="D") +data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"]) +data = data.rolling(7).mean() + +sns.lineplot(data=data, palette="tab10", linewidth=2.5) +``` + +- 输出:test_file_comments.py(包含注释的代码文件),文件内容如下: +``` +# 导入需要的库 +import numpy as np +import pandas as pd +import seaborn as sns + +# 设置 Seaborn 的主题风格为白色网格 +sns.set_theme(style="whitegrid") + +# 生成随机数 +rs = np.random.RandomState(365) + +# 生成 365 行 4 列的随机数,并按行累加 +values = rs.randn(365, 4).cumsum(axis=0) + +# 生成日期 +dates = pd.date_range("1 1 2016", periods=365, freq="D") + +# 将随机数和日期组合成 DataFrame +data = pd.DataFrame(values, dates, columns=["A", "B", "C", "D"]) + +# 对 DataFrame 进行 7 天滑动平均 +data = data.rolling(7).mean() + +# 使用 Seaborn 绘制折线图 +sns.lineplot(data=data, palette="tab10", linewidth=2.5) +``` diff --git a/examples/auto_comments.py b/examples/auto_comments.py new file mode 100644 index 0000000..dcda959 --- /dev/null +++ b/examples/auto_comments.py @@ -0,0 +1,189 @@ +# 运行方式:python auto_comments.py --path 'path of file or folder' +# 脚本功能:使用QWen-7B-Chat为提供的代码文件自动生成注释。(详见auto_comments.md) + + +import argparse +import os +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig + +MaxLine = 50 # 限制单次处理最大代码行数 +SplitKey = ["\ndef "] # 自定义的切分代码标识 +CodeFileType = ["py"] # 目前仅测试过对python文件生成注释 + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--path', type=str, default='Qwen-7B/eval/evaluate_ceval.py') + parser.add_argument('--regenerate', action='store_true', default=False) #如果已经生成过注释,默认不会重新生成 + args = parser.parse_args() + return args + +class QWenChat(): + def __init__(self): + self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) + + # use bf16 + # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval() + # use fp16 + # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval() + # use cpu only + # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="cpu", trust_remote_code=True).eval() + # use auto mode, automatically select precision based on the device. 
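        # use Int4 quantized weights (a lower-memory alternative; sketch based on the AutoGPTQ
        # usage shown in this repo's README, assuming the Qwen/Qwen-7B-Chat-Int4 checkpoint
        # and an auto-gptq installation)
        # from auto_gptq import AutoGPTQForCausalLM
        # model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval()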
+ self.model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True).eval() + + # Specify hyperparameters for generation + self.model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) + self.history = None + + def chat(self, query, system = ""): + + # use history + # response, history = self.model.chat(self.tokenizer, query, history=self.history) + + # 默认不使用history + response, history = self.model.chat(self.tokenizer, query, history=None) + self.history = history + + return response +# 生成注释 +def gen_code_comments(context, model = None, **kwargs): + prompt = "\n为以上代码生成细致的中文注释,注意使用合适的语法。要求必须在每个函数开头生成一段统一的函数功能注释。\n除了注释,请保证原始代码内容不变。不要返回除了注释和代码以外的其余信息,不要生成额外代码。\n" + return model.chat(context + prompt) + +def read_file(path): + f = open(path, "r",encoding='utf-8') + lines = f.readlines() + return "".join(lines) + +def write_file(path, context): + with open(path,'w') as f: + f.write(context) + +# 如果代码文件过长,可以简单按照最大行数切分代码 +def split_context_by_maxline(text): + lines = text.split("\n") + lines_len = len(lines) + res = [] + for i in range(MaxLine, lines_len, MaxLine): + res.append("\n".join(lines[i-MaxLine:i])) + + if i < lines_len: + res.append("\n".join(lines[i:])) + return res + +# 如果代码文件过长,可以简单按照函数切分代码 +def split_context_by_splitkey(text): + blocks = text.split(SplitKey[0]) + return [blocks[0]] + [SplitKey[0]+x for x in blocks[1:]] + +# merge原始代码和生成的注释,目的是保证原始代码不被更改。这部分可以使用各种不同的策略处理。 +def merge_code_and_comments(original_file, comments_path): + res = [] + ori_f = open(original_file, "r",encoding='utf-8') + ori_lines = ori_f.readlines() + + com_f = open(comments_path, "r",encoding='utf-8') + com_lines = com_f.readlines() + len_com_lines = len(com_lines) + p = 0 + j = 0 + for i, line in enumerate(ori_lines): + if line.isspace(): + continue + if line.strip()[0] == '#': + res.append(line) + continue + while j < len_com_lines and line[:-1] not in com_lines[j]: + j += 1 + if j < len_com_lines: + p = j - 1 + up_comments = [] + triple_dot_flag = 0 + while p < j: + if p < 0 or (res and res[-1] and com_lines[p] == res[-1]): + break + if com_lines[p].strip() and (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == '"""' and com_lines[p].strip()[:3] == '"""') or (len(com_lines[p].strip())>3 and com_lines[p].strip()[-3:] == "'''" and com_lines[p].strip()[:3] == "'''"): + up_comments.append(com_lines[p]) + p -= 1 + continue + if com_lines[p].strip() and (com_lines[p].strip()[-3:] == '"""' or com_lines[p].strip()[:3] == '"""' or com_lines[p].strip()[-3:] == "'''" or com_lines[p].strip()[:3] == "'''"): + triple_dot_flag = (triple_dot_flag + 1)%2 + up_comments.append(com_lines[p]) + p -= 1 + continue + if triple_dot_flag: + up_comments.append(com_lines[p]) + p -= 1 + continue + if (com_lines[p].strip()=="") or (com_lines[p].strip() and com_lines[p].strip()[0] == '#' and "省略部分内容" not in com_lines[p]): + up_comments.append(com_lines[p]) + else: + break + p -= 1 + if up_comments: + res.extend(reversed(up_comments)) + if "#" in com_lines[j] and "#" not in line: + in_line_comments = " #" + com_lines[j].split("#")[-1] + res.append(line[:-1]+in_line_comments) + else: + res.append(line) + p = j+1 + else: + res.append(line) + j = p + + write_file(comments_path, "".join(res)) + +# 处理单个文件 +def deal_one_file(model, path, args): + context = read_file(path) + + fname = path.split("/")[-1] + fpath = "/".join(path.split("/")[:-1]) + outfname = fname.split(".")[0]+"_comments."+fname.split(".")[-1] + + comments_path = 
os.path.join(fpath, outfname) + if (not args.regenerate) and os.path.exists(comments_path): + print("use cache: ", comments_path) + return + + context_line = len(context.split("\n")) + if context_line < MaxLine: + res = gen_code_comments(context, model = model) + elif SplitKey[0] not in context: + context_list = split_context_by_maxline(context) + res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list]) + else: + context_list = split_context_by_splitkey(context) + res = "\n".join([gen_code_comments(context_block, model = model) for context_block in context_list]) + + write_file(comments_path, res) + merge_code_and_comments(path, comments_path) + +# 处理文件夹 +def deal_folder(model, path, args): + for fl in os.listdir(path): + now_path = os.path.join(path, fl) + if os.path.isfile(now_path): + if (now_path.split(".")[-1] in CodeFileType) and ("_comments" not in now_path): + deal_one_file(model, now_path, args) + elif os.path.isdir(now_path): + deal_folder(model, now_path, args) + else: + print("Please specify a correct path!") + +def transfer(args): + model = QWenChat() + + if os.path.isfile(args.path): + if (args.path.split(".")[-1] in CodeFileType) and ("_comments" not in args.path): + deal_one_file(model, args.path, args) + elif os.path.isdir(args.path): + deal_folder(model, args.path, args) + else: + print("Please specify a correct path!") + +if __name__ == '__main__': + args = parse_args() + print(args) + transfer(args) From 8310e255131635475d8c591e4bfa2071a29b296f Mon Sep 17 00:00:00 2001 From: yangapku Date: Mon, 21 Aug 2023 14:38:29 +0800 Subject: [PATCH 02/27] update device mapping func for supporting multi-gpu inference --- utils.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 utils.py diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..23294e8 --- /dev/null +++ b/utils.py @@ -0,0 +1,44 @@ +import torch +from transformers import AutoModelForCausalLM +from accelerate import dispatch_model + + +def _device_map(num_gpus, num_layers): + per_gpu_layers = (num_layers + 2) / num_gpus + + device_map = { + 'transformer.wte': 0, + 'transformer.ln_f': 0, + 'lm_head': num_gpus-1 + } + + used = 1 + gpu_target = 0 + for i in range(num_layers): + if used >= per_gpu_layers: + gpu_target += 1 + used = 0 if gpu_target < num_gpus-1 else 1 + assert gpu_target < num_gpus + device_map[f'transformer.h.{i}'] = gpu_target + used += 1 + + return device_map + + +def load_model_on_gpus(model_name_or_path, num_gpus: int = 2): + num_devices = torch.cuda.device_count() + + if num_gpus == 1: + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto', + trust_remote_code=True).eval() + elif 1 < num_gpus <= num_devices: + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', + trust_remote_code=True).eval() + num_layers = model.config.num_hidden_layers + device_map = _device_map(num_gpus, num_layers) + print(device_map) + model = dispatch_model(model, device_map=device_map) + else: + raise KeyError + + return model From 04f896f7d4c332ac06a9f4197874ff2a74b28287 Mon Sep 17 00:00:00 2001 From: yangapku Date: Mon, 21 Aug 2023 21:16:28 +0800 Subject: [PATCH 03/27] update new version of quantization and inference efficiency profiling result --- README.md | 147 +++++++++++++++++++++------------------------------ README_CN.md | 100 +++++++++++++---------------------- README_JA.md | 95 ++++++++++++--------------------- 3 files changed, 132 insertions(+), 210 
deletions(-) diff --git a/README.md b/README.md index 2a644fd..33b70f9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -
+span

@@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


@@ -27,26 +27,27 @@ Qwen-7B is the 7B-parameter version of the large language model series, Qwen (ab The following sections include information that you might find it helpful. Specifically, we advise you to read the FAQ section before you launch issues. -## News +## News and Updates -* 2023.8.3 We release both Qwen-7B and Qwen-7B-Chat on ModelScope and Hugging Face. We also provide a technical memo for more details about the model, including training details and model performance. +* 2023.8.21 We release the Int4 quantized model for Qwen-7B-Chat, **Qwen-7B-Chat-Int4**, which requires low memory costs but achieves improved inference speed. Besides, there is no significant performance degradation on the benchmark evaluation. +* 2023.8.3 We release both **Qwen-7B** and **Qwen-7B-Chat** on ModelScope and Hugging Face. We also provide a technical memo for more details about the model, including training details and model performance. ## Performance In general, Qwen-7B outperforms the baseline models of a similar model size, and even outperforms larger models of around 13B parameters, on a series of benchmark datasets, e.g., MMLU, C-Eval, GSM8K, HumanEval, and WMT22, CMMLU, etc., which evaluate the models' capabilities on natural language understanding, mathematic problem solving, coding, etc. See the results below. -| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU | -| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: |:------------: | -| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - | -| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - | -| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 | -| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 | -| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - | -| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 | -| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - | -| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - | -| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - | -| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** | +| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU | +| :------------- | :--------: | :--------: | :--------: | :---------: | :-------------: | :--------: | +| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - | +| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - | +| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 | +| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 | +| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - | +| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 | +| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - | +| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - | +| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - | +| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** |
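The Int4 checkpoint mentioned in the release note above is loaded through AutoGPTQ rather than plain `transformers`. As a quick orientation before the updated Quantization section later in this patch, here is a minimal end-to-end sketch assembled from the snippets in that section; it assumes AutoGPTQ has been installed from source as described there and reuses the tokenizer from the basic-usage example.

```python
from transformers import AutoTokenizer
from transformers.generation import GenerationConfig
from auto_gptq import AutoGPTQForCausalLM

# Tokenizer is the same one used by the non-quantized chat model.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

# Load the Int4 quantized weights released as Qwen/Qwen-7B-Chat-Int4.
model = AutoGPTQForCausalLM.from_quantized(
    "Qwen/Qwen-7B-Chat-Int4",
    device_map="auto",
    trust_remote_code=True,
    use_safetensors=True,
).eval()

# For the quantized model, the generation config has to be passed in explicitly.
config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config)
print(response)
```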

@@ -195,93 +196,65 @@ Our tokenizer based on tiktoken is different from other tokenizers, e.g., senten ## Quantization -We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have implemented `bitsandbytes`. Note that the requirements for `bitsandbytes` are: +### Usage -``` -**Requirements** Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0. -``` +**Note: we provide a new solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), and release an Int4 quantized model for Qwen-7B-Chat [Click here](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4), which achieves nearly lossless model effects but improved performance on both memory costs and inference speed, in comparison with the previous solution.** -Then run the following command to install `bitsandbytes`: +Here we demonstrate how to use our provided quantized models for inference. Before you start, make sure you meet the requirements of AutoGPTQ and install it from source (temporarily the codes for Qwen are not yet released in the latest version of PyPI package): +```bash +git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ +pip install . ``` -pip install bitsandbytes -``` - -Windows users should find another option, which might be [bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels). -Then you only need to add your quantization configuration to `AutoModelForCausalLM.from_pretrained`. See the example below: +Then you can load the quantized model easily as shown below: ```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig - -# quantization configuration for NF4 (4 bits) -quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type='nf4', - bnb_4bit_compute_dtype=torch.bfloat16 -) +from auto_gptq import AutoGPTQForCausalLM +model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval() +``` -# quantization configuration for Int8 (8 bits) -quantization_config = BitsAndBytesConfig(load_in_8bit=True) +To run inference, it is similar to the basic usage demonstrated above, but remember to pass in the generation configuration explicitly: -model = AutoModelForCausalLM.from_pretrained( - args.checkpoint_path, - device_map="cuda:0", - quantization_config=quantization_config, - max_memory=max_memory, - trust_remote_code=True, -).eval() +```python +from transformers import GenerationConfig +config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True) +response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config) ``` -With this method, it is available to load Qwen-7B in `NF4` and `Int8`, which saves you memory usage. We provide related statistics of model performance below. We find that the quantization downgrades the effectiveness slightly but significantly reduces memory costs. +### Performance -| Precision | MMLU | GPU Memory for Loading Model | -| ----------- | :------: | :---------------------------: | -| BF16 | 56.7 | 16.38G | -| Int8 | 52.8 | 10.44G | -| NF4 | 48.9 | 7.79G | +We illustrate the model performance of both BF16 and Int4 models on the benchmark, and we find that the quantized model does not suffer from significant performance degradation. Results are shown below: -Note: The GPU memory usage profiling in the above table is performed on single A100-SXM4-80G GPU, PyTorch 2.0.1 and CUDA 11.8, with flash attention used. 
- -## Inference Efficiency +| Quantization | MMLU | CEval (val) | GSM8K | Humaneval | +| -------------- | :----: | :-----------: | :-----: | :---------: | +| BF16 | 53.9 | 54.2 | 41.1 | 24.4 | +| Int4 | 52.6 | 52.9 | 38.1 | 23.8 | ### Inference Speed -We measured the average inference speed of generating 2K tokens under BF16 precision and Int8 or NF4 quantization levels, respectively. +We measured the average inference speed (tokens/s) of generating 2048 and 8192 tokens under BF16 precision and Int4 quantization, respectively. -| Quantization Level | Inference Speed with flash_attn (tokens/s) | Inference Speed w/o flash_attn (tokens/s) | -| ---------------------- | :----------------------------------------: | :---------------------------------------: | -| BF16 (no quantization) | 30.06 | 27.55 | -| Int8 (bnb) | 7.94 | 7.86 | -| NF4 (bnb) | 21.43 | 20.37 | +| Quantization | Speed (2048 tokens) | Speed (8192 tokens) | +| -------------- | :-------------------: | :-------------------: | +| BF16 | 30.53 | 28.51 | +| Int4 | 45.60 | 33.83 | -In detail, the setting of profiling is generating 2048 new tokens with 1 context token. The profiling runs on single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.8. The inference speed is averaged over the generated 2048 tokens. +In detail, the setting of profiling is generating 8192 new tokens with 1 context token. The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4. The inference speed is averaged over the generated 8192 tokens. ### GPU Memory Usage -We also profile the peak GPU memory usage for encoding 2048 tokens as context (and generating single token) and generating 8192 tokens (with single token as context) under BF16 or Int8/NF4 quantization levels, respectively. The results are shown below. - -When using flash attention, the memory usage is: +We also profile the peak GPU memory usage for encoding 2048 tokens as context (and generating single token) and generating 8192 tokens (with single token as context) under BF16 or Int4 quantization level, respectively. The results are shown below. -| Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | -| ------------------ | :---------------------------------: | :-----------------------------------: | -| BF16 | 18.11GB | 23.52GB | -| Int8 | 12.17GB | 17.60GB | -| NF4 | 9.52GB | 14.93GB | - -When not using flash attention, the memory usage is: - -| Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | -| ------------------ | :---------------------------------: | :-----------------------------------: | -| BF16 | 18.11GB | 24.40GB | -| Int8 | 12.18GB | 18.47GB | -| NF4 | 9.52GB | 15.81GB | +| Quantization | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | +| -------------- | :-----------------------------------: | :-------------------------------------: | +| BF16 | 18.99GB | 24.40GB | +| In4 | 10.20GB | 15.61GB | The above speed and memory profiling are conducted using [this script](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py). ## Demo - ### Web UI We provide code for users to build a web UI demo (thanks to @wysaid). Before you start, make sure you install the following packages: @@ -371,22 +344,22 @@ print(response.choices[0].message.content) Qwen-7B-Chat is specifically optimized for tool usage, including API, database, models, etc., so that users can build their own Qwen-7B-based LangChain, Agent, and Code Interpreter. 
In our evaluation [benchmark](eval/EVALUATION.md) for assessing tool usage capabilities, we find that Qwen-7B reaches stable performance. -| Model | Tool Selection (Acc.↑) | Tool Input (Rouge-L↑) | False Positive Error↓ | -|:------------|:----------------------:|:----------------------:|:----------------------:| -| GPT-4 | 95% | **0.90** | 15% | -| GPT-3.5 | 85% | 0.88 | 75% | -| **Qwen-7B** | **99%** | 0.89 | **9.7%** | +| Model | Tool Selection (Acc.↑) | Tool Input (Rouge-L↑) | False Positive Error↓ | +| :------------ | :-----------------------: | :----------------------: | :----------------------: | +| GPT-4 | 95% | **0.90** | 15% | +| GPT-3.5 | 85% | 0.88 | 75% | +| **Qwen-7B** | **99%** | 0.89 | **9.7%** | For how to write and use prompts for ReAct Prompting, please refer to [the ReAct examples](examples/react_prompt.md). The use of tools can enable the model to better perform tasks. Additionally, we provide experimental results to show its capabilities of playing as an agent. See [Hugging Face Agent](https://huggingface.co/docs/transformers/transformers_agents) for more information. Its performance on the run-mode benchmark provided by Hugging Face is as follows: -| Model | Tool Selection↑ | Tool Used↑ | Code↑ | -|:---------------|:---------------:|:-----------:|:---------:| -|GPT-4 | **100** | **100** | **97.41** | -|GPT-3.5 | 95.37 | 96.30 | 87.04 | -|StarCoder-15.5B | 87.04 | 87.96 | 68.89 | -| **Qwen-7B** | 90.74 | 92.59 | 74.07 | +| Model | Tool Selection↑ | Tool Used↑ | Code↑ | +| :---------------- | :----------------: | :-----------: | :---------: | +| GPT-4 | **100** | **100** | **97.41** | +| GPT-3.5 | 95.37 | 96.30 | 87.04 | +| StarCoder-15.5B | 87.04 | 87.96 | 68.89 | +| **Qwen-7B** | 90.74 | 92.59 | 74.07 | ## Long-Context Understanding diff --git a/README_CN.md b/README_CN.md index af4d8f9..5e00be4 100644 --- a/README_CN.md +++ b/README_CN.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


@@ -29,6 +29,8 @@ ## 新闻 +* 2023年8月21日 发布Qwen-7B-Chat的Int4量化模型,Qwen-7B-Chat-Int4。该模型显存占用低,推理速度相比半精度模型显著提升,在基准评测上效果损失较小。 + * 2023年8月3日 在魔搭社区(ModelScope)和Hugging Face同步推出Qwen-7B和Qwen-7B-Chat模型。同时,我们发布了技术备忘录,介绍了相关的训练细节和模型表现。 ## 评测表现 @@ -198,89 +200,62 @@ print(f'Response: {response}') ## 量化 -如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。请注意,`bitsandbytes`的安装要求是: +### 用法 -``` -**Requirements** Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0. -``` +**请注意:我们更新量化方案为基于[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)的量化,提供Qwen-7B-Chat的Int4量化模型[点击这里](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)。相比此前方案,该方案在模型评测效果几乎无损,且存储需求更低,推理速度更优。** -随后运行如下命令安装`bitsandbytes`: +以下我们提供示例说明如何使用Int4量化模型。在开始使用前,请先保证满足AutoGPTQ的要求,并使用源代码安装(由于最新支持Qwen的代码未发布到PyPI): +```bash +git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ +pip install . ``` -pip install bitsandbytes -``` - -Windows用户需安装特定版本的`bitsandbytes`,可选项包括[bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。 -你只需要在`AutoModelForCausalLM.from_pretrained`中添加你的量化配置,即可使用量化模型。如下所示: +随后便能轻松读取量化模型: ```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig - -# quantization configuration for NF4 (4 bits) -quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type='nf4', - bnb_4bit_compute_dtype=torch.bfloat16 -) +from auto_gptq import AutoGPTQForCausalLM +model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval() +``` -# quantization configuration for Int8 (8 bits) -quantization_config = BitsAndBytesConfig(load_in_8bit=True) +推理方法和基础用法类似,但注意需要从外部传入generation config: -model = AutoModelForCausalLM.from_pretrained( - args.checkpoint_path, - device_map="cuda:0", - quantization_config=quantization_config, - max_memory=max_memory, - trust_remote_code=True, -).eval() +```python +from transformers import GenerationConfig +config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True) +response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config) ``` -上述方法可以让我们将模型量化成`NF4`和`Int8`精度的模型进行读取,帮助我们节省显存开销。我们也提供了相关性能数据。我们发现尽管模型在效果上存在损失,但模型的显存开销大幅降低。 +### 效果评测 -| Precision | MMLU | GPU Memory for Loading Model | -| ----------- | :------: | :---------------------------: | -| BF16 | 56.7 | 16.38G | -| Int8 | 52.8 | 10.44G | -| NF4 | 48.9 | 7.79G | +我们对BF16和Int4模型在基准评测上做了测试,发现量化模型效果损失较小,结果如下所示: -注:表中显存占用的测试环境为A100-SXM4-80G单卡,PyTorch 2.0.1,CUDA 11.8,开启flash attention - -## 推理性能 +| Quantization | MMLU | CEval (val) | GSM8K | Humaneval | +| ------------- | :--------: | :----------: | :----: | :--------: | +| BF16 | 53.9 | 54.2 | 41.1 | 24.4 | +| Int4 | 52.6 | 52.9 | 38.1 | 23.8 | ### 推理速度 -我们分别测试了BF16和量化条件下,模型生成2K tokens的平均推理速度,结果如下 - -| 量化等级 | 开flash_attn的推理速度 (字符/秒) | 关flash_attn的推理速度 (字符/秒) | -| ------ | :---------------------------: | :---------------------------: | -| BF16 (无量化) | 30.06 | 27.55 | -| Int8 (bnb) | 7.94 | 7.86 | -| NF4 (bnb) | 21.43 | 20.37 | +我们测算了BF16和Int4模型生成2048和8192个token的平均推理速度(tokens/s)。如图所示: -具体的评测方式为:指定输入context长度为1,生成长度为2048;测试硬件为A100-SXM4-80G单卡,软件环境为PyTorch 2.0.1,CUDA版本11.8,计算生成该2048序列的平均速度 +| Quantization | Speed (2048 tokens) | Speed (8192 tokens) | +| ------------- | :------------------:| :------------------:| +| BF16 | 30.53 | 28.51 | +| Int4 | 45.60 | 33.83 | -### 显存占用 +具体而言,我们记录在长度为1的上下文的条件下生成8192个token的性能。评测运行于单张A100-SXM4-80G GPU,使用PyTorch 2.0.1和CUDA 
11.4。推理速度是生成8192个token的速度均值。 -在BF16和不同量化条件下,我们分别测算了模型编码2048长度序列(并生成1个token),和生成8192长度序列(编码1个token作为context)的峰值显存占用。结果如下 +### 显存使用 -打开flash attention时 +我们还测算了BF16和Int4模型编码2048个token及生成8192个token的峰值显存占用情况。结果如下所示: -| 量化等级 | 编码 2048 长度的峰值显存 | 生成 8192 长度的峰值显存 | -| --- | :---: | :---: | -| BF16 | 18.11GB | 23.52GB | -| Int8 | 12.17GB | 17.60GB | -| NF4 | 9.52GB | 14.93GB | +| Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | +| ------------------ | :---------------------------------: | :-----------------------------------: | +| BF16 | 18.99GB | 24.40GB | +| In4 | 10.20GB | 15.61GB | -关闭flash attention时 - -| 量化等级 | 编码 2048 长度的峰值显存 | 生成 8192 长度的峰值显存 | -| --- | :---: | :---: | -| BF16 | 18.11GB | 24.40GB | -| Int8 | 12.18GB | 18.47GB | -| NF4 | 9.52GB | 15.81GB | - -以上测速和显存占用情况,均可通过该[评测脚本](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)测算得到。 +上述性能测算使用[此脚本](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)完成。 ## Demo @@ -304,7 +279,6 @@ python web_demo.py

- ### 交互式Demo 我们提供了一个简单的交互式Demo示例,请查看`cli_demo.py`。当前模型已经支持流式输出,用户可通过输入文字的方式和Qwen-7B-Chat交互,模型将流式输出返回结果。运行如下命令: diff --git a/README_JA.md b/README_JA.md index f178493..008037c 100644 --- a/README_JA.md +++ b/README_JA.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


@@ -33,6 +33,8 @@ Qwen-7Bは、アリババクラウドが提唱する大規模言語モデルシ ## ニュース +* 2023.8.21 Qwen-7B-Chat 用 Int4 量子化モデル(**Qwen-7B-Chat-Int4**)をリリースしました。メモリコストは低いが、推論速度は向上している。また、ベンチマーク評価において大きな性能劣化はありません。 + * 2023.8.3 Qwen-7B と Qwen-7B-Chat を ModelScope と Hugging Face で公開。また、トレーニングの詳細やモデルの性能など、モデルの詳細についてはテクニカルメモを提供しています。 ## パフォーマンス @@ -199,89 +201,62 @@ tiktoken に基づくトークナイザーは、他のトークナイザー、 ## 量子化 -`NF4` と `Int8` のモデルをロードする方法を示す例を提供します。手始めに、`bitsandbytes` が実装されていることを確認して下さい。`bitsandbytes` の要件は以下の通りになります: +### 使用方法 -``` -**必要条件** Python >= 3.8。Linux ディストリビューション(Ubuntu、MacOS など)+ CUDA > 10.0。 -``` +**注:[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)に基づく新しい解決策を提供し、Qwen-7B-Chat用のInt4量子化モデル[ここをクリック](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4)をリリースしました。このモデルは、従来の解決策と比較して、ほぼ無損失のモデル効果を達成しつつ、メモリコストと推論速度の両方で性能が向上しています**。 -そして、以下のコマンドを実行して `bitsandbytes` をインストールする: +ここでは、量子化されたモデルを推論に使用する方法を示します。始める前に、AutoGPTQの要件を満たしていることを確認し、ソースからインストールしてください(一時的にQwenのコードは最新版のPyPIパッケージではまだリリースされていません): +```bash +git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ +pip install . ``` -pip install bitsandbytes -``` - -Windows ユーザは、[bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) という別のオプションを見つける必要があります。 -そして、量子化の設定を `AutoModelForCausalLM.from_pretrained` に追加するだけとなります。以下の例を参照してください: +そうすれば、以下のように簡単に量子化モデルを読み込むことができる。 ```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig - -# NF4(4ビット)の量子化設定 -quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type='nf4', - bnb_4bit_compute_dtype=torch.bfloat16 -) +from auto_gptq import AutoGPTQForCausalLM +model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen-7B-Chat-Int4", device_map="auto", trust_remote_code=True, use_safetensors=True).eval() +``` -# Int8(8ビット)の量子化設定 -quantization_config = BitsAndBytesConfig(load_in_8bit=True) +推論を実行するには、上で示した基本的な使い方に似ているが、generation configurationを明示的に渡すことを忘れないこと: -model = AutoModelForCausalLM.from_pretrained( - args.checkpoint_path, - device_map="cuda:0", - quantization_config=quantization_config, - max_memory=max_memory, - trust_remote_code=True, -).eval() +```python +from transformers import GenerationConfig +config = GenerationConfig.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True) +response, history = model.chat(tokenizer, "Hi", history=None, generation_config=config) ``` -この方法では、Qwen-7B を `NF4` と `Int8` でロードすることができ、メモリ使用量を節約できる。以下にモデル性能の関連統計量を示します。量子化により、有効性は若干低下するが、推論効率は大幅に向上し、メモリコストが削減されることがわかります。 +### 性能 -| Precision | MMLU | GPU Memory for Loading Model | -| ----------- | :------: | :---------------------------: | -| BF16 | 56.7 | 16.38G | -| Int8 | 52.8 | 10.44G | -| NF4 | 48.9 | 7.79G | +ベンチマークにおけるBF16モデルとInt4モデルの性能について説明する。結果を以下に示します: -注:上表のGPUメモリ使用量プロファイリングは、シングルA100-SXM4-80G GPU、PyTorch 2.0.1、CUDA 11.8、フラッシュアテンション使用で実行されています。 - -## 推論効率 +| Quantization | MMLU | CEval (val) | GSM8K | Humaneval | +| ------------- | :--------: | :----------: | :----: | :--------: | +| BF16 | 53.9 | 54.2 | 41.1 | 24.4 | +| Int4 | 52.6 | 52.9 | 38.1 | 23.8 | ### 推論スピード -BF16精度、量子化レベルInt8またはNF4で、それぞれ2Kトークンを生成する平均推論速度を測定した。 +BF16の精度とInt4の量子化レベルの下で、それぞれ2048個と8192個のトークンを生成する平均推論速度(tokens/s)を測定した。 -| Quantization Level | Inference Speed with flash_attn (tokens/s) | Inference Speed w/o flash_attn (tokens/s) | -| ------ | :---------------------------: | :---------------------------: | -| BF16 (no quantization) | 30.06 | 27.55 | -| Int8 (bnb) | 7.94 | 7.86 | -| NF4 (bnb) | 21.43 | 20.37 | +| Quantization | Speed (2048 tokens) | Speed (8192 tokens) | +| 
------------- | :------------------:| :------------------:| +| BF16 | 30.53 | 28.51 | +| Int4 | 45.60 | 33.83 | -詳細には、プロファイリングの設定は、1コンテクスト・トークンで2048の新しいトークンを生成している。プロファイリングは、PyTorch 2.0.1とCUDA 11.8を搭載したシングルA100-SXM4-80G GPUで実行される。推論速度は生成された2048個のトークンの平均です。 +詳細には、プロファイリングの設定は、1コンテクスト・トークンで8192個の新しいトークンを生成している。プロファイリングは、PyTorch 2.0.1とCUDA 11.4を搭載したシングルA100-SXM4-80G GPUで実行される。推論速度は生成された8192個のトークンの平均値です。 ### GPUメモリ使用量 -また、BF16またはInt8/NF4量子化レベルの下で、2048個のトークンをコンテキストとしてエンコードした場合(および単一のトークンを生成した場合)と、8192個のトークンを生成した場合(単一のトークンをコンテキストとして生成した場合)のGPUメモリ使用量のピーク値をそれぞれプロファイリングしました。結果を以下に示す。 - -Flash attentionを使用した場合のメモリ使用量は以下の通りである: - -| Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | -| --- | :---: | :---: | -| BF16 | 18.11GB | 23.52GB | -| Int8 | 12.17GB | 17.60GB | -| NF4 | 9.52GB | 14.93GB | - -Flash attentionを使用しない場合、メモリ使用量は次のようになる: +また、BF16またはInt4の量子化レベルで、それぞれ2048トークンをコンテキストとしてエンコードした場合(および単一のトークンを生成した場合)と、8192トークンを生成した場合(単一のトークンをコンテキストとして生成した場合)のGPUメモリ使用量のピーク値をプロファイリングしました。その結果を以下に示します。 | Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | -| --- | :---: | :---: | -| BF16 | 18.11GB | 24.40GB | -| Int8 | 12.18GB | 18.47GB | -| NF4 | 9.52GB | 15.81GB | +| ------------------ | :---------------------------------: | :-----------------------------------: | +| BF16 | 18.99GB | 24.40GB | +| In4 | 10.20GB | 15.61GB | -上記のスピードとメモリーのプロファイリングは、[このスクリプト](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)を使って行われた。 +上記のスピードとメモリーのプロファイリングは、[このスクリプト](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)を使用しています。 ## デモ From cc1b252c8b4fd56d818c592006453dfaeba67774 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Mon, 21 Aug 2023 21:22:55 +0800 Subject: [PATCH 04/27] Update README.md fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 33b70f9..48f355c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -span +

From f0ec7f75252e1b246280d5a3ceaafd03b9fcdb99 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Mon, 21 Aug 2023 21:22:55 +0800 Subject: [PATCH 05/27] Update README.md fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 33b70f9..9d59298 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -span +

From 6446fe04370acd6ab2b8f4734cbf7844f0412f80 Mon Sep 17 00:00:00 2001 From: Yang An Date: Tue, 22 Aug 2023 08:41:30 +0800 Subject: [PATCH 06/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d59298..ac38122 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ We also profile the peak GPU memory usage for encoding 2048 tokens as context (a | Quantization | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | | -------------- | :-----------------------------------: | :-------------------------------------: | | BF16 | 18.99GB | 24.40GB | -| In4 | 10.20GB | 15.61GB | +| Int4 | 10.20GB | 15.61GB | The above speed and memory profiling are conducted using [this script](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py). From a9985474094237f5d274f6c28c31278297fe274f Mon Sep 17 00:00:00 2001 From: Yang An Date: Tue, 22 Aug 2023 08:41:58 +0800 Subject: [PATCH 07/27] Update README_CN.md --- README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_CN.md b/README_CN.md index 5e00be4..deea87e 100644 --- a/README_CN.md +++ b/README_CN.md @@ -253,7 +253,7 @@ response, history = model.chat(tokenizer, "Hi", history=None, generation_config= | Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | | ------------------ | :---------------------------------: | :-----------------------------------: | | BF16 | 18.99GB | 24.40GB | -| In4 | 10.20GB | 15.61GB | +| Int4 | 10.20GB | 15.61GB | 上述性能测算使用[此脚本](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)完成。 From ce5f0e18c6a55a6aa0c4ac74f0f238239230e67f Mon Sep 17 00:00:00 2001 From: Yang An Date: Tue, 22 Aug 2023 08:42:13 +0800 Subject: [PATCH 08/27] Update README_JA.md --- README_JA.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_JA.md b/README_JA.md index 008037c..3eff97f 100644 --- a/README_JA.md +++ b/README_JA.md @@ -254,7 +254,7 @@ BF16の精度とInt4の量子化レベルの下で、それぞれ2048個と8192 | Quantization Level | Peak Usage for Encoding 2048 Tokens | Peak Usage for Generating 8192 Tokens | | ------------------ | :---------------------------------: | :-----------------------------------: | | BF16 | 18.99GB | 24.40GB | -| In4 | 10.20GB | 15.61GB | +| Int4 | 10.20GB | 15.61GB | 上記のスピードとメモリーのプロファイリングは、[このスクリプト](https://qianwen-res.oss-cn-beijing.aliyuncs.com/profile.py)を使用しています。 From 4ae8a9f3404644c62929ef94be4535091668b4f0 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:24:35 +0800 Subject: [PATCH 09/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ac38122..34d5f85 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From 10cbe0dd4ce824179f60a01abb92e17bbcf9fdb5 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:24:59 +0800 Subject: [PATCH 10/27] Update README_CN.md --- README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_CN.md b/README_CN.md index deea87e..dea4cc8 100644 --- a/README_CN.md +++ b/README_CN.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From e2dca374284bc8890c7e03cad79431f88d2e6e69 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:25:49 +0800 Subject: [PATCH 11/27] Update README_JA.md --- README_JA.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_JA.md b/README_JA.md index 3eff97f..a8ded47 100644 --- a/README_JA.md +++ b/README_JA.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From 4b6a2f01704b166f56aff2a6ff867ee58a266609 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:28:49 +0800 Subject: [PATCH 12/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 34d5f85..1205e3b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From ba651d9b03c42ad33ebea7c5d87c5d2408becdc3 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:29:07 +0800 Subject: [PATCH 13/27] Update README_CN.md --- README_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_CN.md b/README_CN.md index dea4cc8..811d5e2 100644 --- a/README_CN.md +++ b/README_CN.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From e494489bf1b5bdc3d1d5c6dd3ac5f7647e950064 Mon Sep 17 00:00:00 2001 From: Junyang Lin Date: Wed, 23 Aug 2023 01:29:49 +0800 Subject: [PATCH 14/27] Update README_JA.md --- README_JA.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_JA.md b/README_JA.md index a8ded47..828229f 100644 --- a/README_JA.md +++ b/README_JA.md @@ -6,7 +6,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord


From a3a5b3de4774888e2f605129806226405070b44e Mon Sep 17 00:00:00 2001 From: cyente Date: Wed, 23 Aug 2023 16:30:53 +0800 Subject: [PATCH 15/27] add stop word on openai api ChatCompletion --- README.md | 6 ++++-- README_CN.md | 4 +++- openai_api.py | 27 +++++++++++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1205e3b..63cc127 100644 --- a/README.md +++ b/README.md @@ -318,7 +318,8 @@ for chunk in openai.ChatCompletion.create( messages=[ {"role": "user", "content": "你好"} ], - stream=True + stream=True + # Specifying stop words in streaming output format is not yet supported and is under development. ): if hasattr(chunk.choices[0].delta, "content"): print(chunk.choices[0].delta.content, end="", flush=True) @@ -329,7 +330,8 @@ response = openai.ChatCompletion.create( messages=[ {"role": "user", "content": "你好"} ], - stream=False + stream=False, + stop=[] # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting. ) print(response.choices[0].message.content) ``` diff --git a/README_CN.md b/README_CN.md index 811d5e2..4764fe4 100644 --- a/README_CN.md +++ b/README_CN.md @@ -323,6 +323,7 @@ for chunk in openai.ChatCompletion.create( {"role": "user", "content": "你好"} ], stream=True + # 流式输出的自定义stopwords功能尚未支持,正在开发中 ): if hasattr(chunk.choices[0].delta, "content"): print(chunk.choices[0].delta.content, end="", flush=True) @@ -333,7 +334,8 @@ response = openai.ChatCompletion.create( messages=[ {"role": "user", "content": "你好"} ], - stream=False + stream=False, + stop=[] # 在此处添加自定义的stop words 例如ReAct prompting时需要增加: stop=["Observation:"]。 ) print(response.choices[0].message.content) ``` diff --git a/openai_api.py b/openai_api.py index da105f3..52da00b 100644 --- a/openai_api.py +++ b/openai_api.py @@ -68,6 +68,7 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = None max_length: Optional[int] = None stream: Optional[bool] = False + stop: Optional[List[str]] = [] class ChatCompletionResponseChoice(BaseModel): @@ -103,7 +104,8 @@ async def create_chat_completion(request: ChatCompletionRequest): if request.messages[-1].role != "user": raise HTTPException(status_code=400, detail="Invalid request") query = request.messages[-1].content - + stop_words = request.stop + stop_words.extend(list(map(lambda x: x[1:], filter(lambda x: x.startswith("\n"), stop_words)))) prev_messages = request.messages[:-1] # Temporarily, the system role does not work as expected. We advise that you write the setups for role-play in your query. 
# if len(prev_messages) > 0 and prev_messages[0].role == "system": @@ -120,10 +122,18 @@ async def create_chat_completion(request: ChatCompletionRequest): raise HTTPException(status_code=400, detail="Invalid request.") if request.stream: - generate = predict(query, history, request.model) + generate = predict(query, history, request.model, stop_words) return EventSourceResponse(generate, media_type="text/event-stream") - response, _ = model.chat(tokenizer, query, history=history) + if stop_words: + react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words] + response, _ = model.chat(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens) + for stop_ in stop_words: + if response.endswith(stop_): + response = response[:response.find(stop_)] + else: + response, _ = model.chat(tokenizer, query, history=history) + choice_data = ChatCompletionResponseChoice( index=0, message=ChatMessage(role="assistant", content=response), @@ -133,9 +143,9 @@ async def create_chat_completion(request: ChatCompletionRequest): return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion") -async def predict(query: str, history: List[List[str]], model_id: str): +async def predict(query: str, history: List[List[str]], model_id: str, stop_words: List[str]): global model, tokenizer - + assert stop_words == [], "in stream format, stop word is output" choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(role="assistant"), @@ -145,8 +155,13 @@ async def predict(query: str, history: List[List[str]], model_id: str): yield "{}".format(chunk.model_dump_json(exclude_unset=True)) current_length = 0 + if stop_words: + react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words] + response_generator = model.chat_stream(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens) + else: + response_generator = model.chat_stream(tokenizer, query, history=history) - for new_response in model.chat_stream(tokenizer, query, history): + for new_response in response_generator: if len(new_response) == current_length: continue From 562537e65afa8e3a94a61f74a79e2f948c360dbf Mon Sep 17 00:00:00 2001 From: JustinLin610 Date: Thu, 24 Aug 2023 12:05:19 +0800 Subject: [PATCH 16/27] update web demo --- README_JA.md | 1 - web_demo.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/README_JA.md b/README_JA.md index 828229f..7bcc629 100644 --- a/README_JA.md +++ b/README_JA.md @@ -34,7 +34,6 @@ Qwen-7Bは、アリババクラウドが提唱する大規模言語モデルシ ## ニュース * 2023.8.21 Qwen-7B-Chat 用 Int4 量子化モデル(**Qwen-7B-Chat-Int4**)をリリースしました。メモリコストは低いが、推論速度は向上している。また、ベンチマーク評価において大きな性能劣化はありません。 - * 2023.8.3 Qwen-7B と Qwen-7B-Chat を ModelScope と Hugging Face で公開。また、トレーニングの詳細やモデルの性能など、モデルの詳細についてはテクニカルメモを提供しています。 ## パフォーマンス diff --git a/web_demo.py b/web_demo.py index bd25f7f..e94f6be 100755 --- a/web_demo.py +++ b/web_demo.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
"""A simple web interactive chat demo based on gradio.""" - +import os from argparse import ArgumentParser import gradio as gr @@ -44,17 +44,29 @@ def _load_model_tokenizer(args): else: device_map = "auto" - model = AutoModelForCausalLM.from_pretrained( - args.checkpoint_path, - device_map=device_map, - trust_remote_code=True, - resume_download=True, - ).eval() - model.generation_config = GenerationConfig.from_pretrained( + qconfig_path = os.path.join(args.checkpoint_path, 'quantize_config.json') + if os.path.exists(qconfig_path): + from auto_gptq import AutoGPTQForCausalLM + model = AutoGPTQForCausalLM.from_quantized( + args.checkpoint_path, + device_map=device_map, + trust_remote_code=True, + resume_download=True, + use_safetensors=True, + ).eval() + else: + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map=device_map, + trust_remote_code=True, + resume_download=True, + ).eval() + + config = GenerationConfig.from_pretrained( args.checkpoint_path, trust_remote_code=True, resume_download=True, ) - return model, tokenizer + return model, tokenizer, config def postprocess(self, y): @@ -103,14 +115,14 @@ def _parse_text(text): return text -def _launch_demo(args, model, tokenizer): +def _launch_demo(args, model, tokenizer, config): def predict(_query, _chatbot, _task_history): print(f"User: {_parse_text(_query)}") _chatbot.append((_parse_text(_query), "")) full_response = "" - for response in model.chat_stream(tokenizer, _query, history=_task_history): + for response in model.chat_stream(tokenizer, _query, history=_task_history, generation_config=config): _chatbot[-1] = (_parse_text(_query), _parse_text(response)) yield _chatbot @@ -183,9 +195,9 @@ including hate speech, violence, pornography, deception, etc. \ def main(): args = _get_args() - model, tokenizer = _load_model_tokenizer(args) + model, tokenizer, config = _load_model_tokenizer(args) - _launch_demo(args, model, tokenizer) + _launch_demo(args, model, tokenizer, config) if __name__ == '__main__': From 5c49d35a7e33a531075168729bade716389ad6a8 Mon Sep 17 00:00:00 2001 From: yangapku Date: Thu, 24 Aug 2023 14:14:30 +0800 Subject: [PATCH 17/27] format line --- openai_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openai_api.py b/openai_api.py index 52da00b..42b0841 100644 --- a/openai_api.py +++ b/openai_api.py @@ -202,7 +202,7 @@ def _get_args(): if __name__ == "__main__": args = _get_args() - + tokenizer = AutoTokenizer.from_pretrained( args.checkpoint_path, trust_remote_code=True, resume_download=True, ) @@ -218,7 +218,7 @@ if __name__ == "__main__": trust_remote_code=True, resume_download=True, ).eval() - + model.generation_config = GenerationConfig.from_pretrained( args.checkpoint_path, trust_remote_code=True, resume_download=True, ) From d0cc30be23b23bcf80b8110b25e70fa2c569efec Mon Sep 17 00:00:00 2001 From: yangapku Date: Fri, 25 Aug 2023 14:52:39 +0800 Subject: [PATCH 18/27] update README --- README.md | 6 ++++-- README_CN.md | 5 +++-- README_JA.md | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 63cc127..bcc0a2f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -
+span

@@ -6,7 +6,9 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 +
+Demo  |  Report   |   Discord  |  WeChat


diff --git a/README_CN.md b/README_CN.md index 4764fe4..24b190c 100644 --- a/README_CN.md +++ b/README_CN.md @@ -6,7 +6,9 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 +
+Demo  |  Report   |   Discord  |  WeChat


@@ -30,7 +32,6 @@ ## 新闻 * 2023年8月21日 发布Qwen-7B-Chat的Int4量化模型,Qwen-7B-Chat-Int4。该模型显存占用低,推理速度相比半精度模型显著提升,在基准评测上效果损失较小。 - * 2023年8月3日 在魔搭社区(ModelScope)和Hugging Face同步推出Qwen-7B和Qwen-7B-Chat模型。同时,我们发布了技术备忘录,介绍了相关的训练细节和模型表现。 ## 评测表现 diff --git a/README_JA.md b/README_JA.md index 7bcc629..a871160 100644 --- a/README_JA.md +++ b/README_JA.md @@ -6,7 +6,9 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗  |  Demo  |  Report   |   Discord + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 +
+Demo  |  Report   |   Discord  |  WeChat


From 6f5f076ad13ee0f92976d22b170263b98de3d2ee Mon Sep 17 00:00:00 2001 From: yangapku Date: Fri, 25 Aug 2023 14:54:14 +0800 Subject: [PATCH 19/27] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bcc0a2f..81d7ac6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -span +

From dad3b3a408052152c59ee6057f564f7a362627c6 Mon Sep 17 00:00:00 2001 From: yangapku Date: Fri, 25 Aug 2023 15:35:29 +0800 Subject: [PATCH 20/27] update README --- README.md | 12 +++++------- README_CN.md | 14 +++++++------- README_JA.md | 22 ++++++++++------------ 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 81d7ac6..bf4999a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -
+

+ 中文  |  English  |  日本語 +

+

@@ -8,12 +11,7 @@

Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
-Demo  |  Report   |   Discord  |  WeChat -

-
- -

- 中文  |  English  |  日本語 +WeChat   |   Discord   |   Demo  |  Report



diff --git a/README_CN.md b/README_CN.md index 24b190c..65bcf45 100644 --- a/README_CN.md +++ b/README_CN.md @@ -1,4 +1,7 @@ -
+

+ 中文  |  English  |  日本語 +

+

@@ -8,15 +11,12 @@

Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
-Demo  |  Report   |   Discord  |  WeChat -

-
- -

- 中文  |  English  |  日本語 +WeChat   |   Discord   |   Demo  |  Report



+ + 我们在🤖 **ModelScope**以及🤗 **Hugging Face**均开源了**Qwen-7B**系列模型。请在本文档顶部点击相关链接查看仓库信息。本仓库主要包括Qwen-7B的简介、使用指南、技术备忘等内容。想了解更多关于模型的信息,请点击[链接](tech_memo.md)查看我们的技术备忘录。 通义千问-7B(Qwen-7B) 是阿里云研发的通义千问大模型系列的70亿参数规模的模型。Qwen-7B是基于Transformer的大语言模型, 在超大规模的预训练数据上进行训练得到。预训练数据类型多样,覆盖广泛,包括大量网络文本、专业书籍、代码等。同时,在Qwen-7B的基础上,我们使用对齐机制打造了基于大语言模型的AI助手Qwen-7B-Chat。Qwen-7B系列模型的特点包括: diff --git a/README_JA.md b/README_JA.md index a871160..eddffac 100644 --- a/README_JA.md +++ b/README_JA.md @@ -1,23 +1,21 @@ -
+

+ 中文  |  English  |  日本語 +

+

-
-

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 -
-Demo  |  Report   |   Discord  |  WeChat +

+ Japanese document maintainer: Ikko Eltociear Ashimine

-
+

- 中文  |  English  |  日本語 -

-

-

- Japanese document maintainer: Ikko Eltociear Ashimine + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 +
+WeChat   |   Discord   |   Demo  |  Report



From 5dbbd1025b5a3d9b51335937d4a45cd048bbe423 Mon Sep 17 00:00:00 2001 From: yangapku Date: Fri, 25 Aug 2023 15:48:07 +0800 Subject: [PATCH 21/27] update readme --- README.md | 2 +- README_CN.md | 2 +- README_JA.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bf4999a..059fa6c 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
WeChat   |   Discord   |   Demo  |  Report

diff --git a/README_CN.md b/README_CN.md index 65bcf45..e6a83c6 100644 --- a/README_CN.md +++ b/README_CN.md @@ -9,7 +9,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
WeChat   |   Discord   |   Demo  |  Report

diff --git a/README_JA.md b/README_JA.md index eddffac..31a3170 100644 --- a/README_JA.md +++ b/README_JA.md @@ -13,7 +13,7 @@

- Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗 + Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
WeChat   |   Discord   |   Demo  |  Report

From 1a9a04a91ee99ad394e3c5281b46e028d4be1c55 Mon Sep 17 00:00:00 2001 From: yangapku Date: Fri, 25 Aug 2023 18:12:11 +0800 Subject: [PATCH 22/27] update readme --- README_JA.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README_JA.md b/README_JA.md index 31a3170..cddf84f 100644 --- a/README_JA.md +++ b/README_JA.md @@ -7,17 +7,17 @@

-

- Japanese document maintainer: Ikko Eltociear Ashimine -

-

-

Qwen-7B 🤖 | 🤗  | Qwen-7B-Chat 🤖 | 🤗  | Qwen-7B-Chat-Int4 🤗
WeChat   |   Discord   |   Demo  |  Report

-

+
+ +

+ Japanese document maintainer: Ikko Eltociear Ashimine +

+
私たちは、**Qwen-7B** と **Qwen-7B-Chat** を **🤖 ModelScope** と **🤗 Hugging Face** の両方でオープンソース化しています(上部のロゴをクリックすると、コードとチェックポイントのあるリポジトリに移動します)。このレポには、Qwen-7B の簡単な紹介と、使い方の手引き、さらに詳しい情報を提供する技術メモ [link](tech_memo.md) が含まれています。 From 4864f7b27849398c975be8ea06024f0e313fc7f4 Mon Sep 17 00:00:00 2001 From: "feihu.hf" Date: Fri, 25 Aug 2023 22:44:07 +0800 Subject: [PATCH 23/27] fix format problems in evaluation code; update ceval extraction rules --- eval/EVALUATION.md | 13 + eval/evaluate_ceval.py | 407 ++++++++++++++++++++++---------- eval/evaluate_chat_ceval.py | 387 +++++++++++++++++++++--------- eval/evaluate_chat_gsm8k.py | 108 +++++---- eval/evaluate_chat_humaneval.py | 97 +++++--- eval/evaluate_chat_mmlu.py | 247 +++++++++++++------ eval/evaluate_cmmlu.py | 320 ++++++++++++++----------- eval/evaluate_gsm8k.py | 91 ++++--- eval/evaluate_humaneval.py | 75 +++--- eval/evaluate_mmlu.py | 275 ++++++++++++++------- eval/evaluate_plugin.py | 301 ++++++++++++----------- 11 files changed, 1510 insertions(+), 811 deletions(-) diff --git a/eval/EVALUATION.md b/eval/EVALUATION.md index 44e0af6..1381e69 100644 --- a/eval/EVALUATION.md +++ b/eval/EVALUATION.md @@ -34,6 +34,19 @@ pip install thefuzz python evaluate_chat_mmlu.py -d data/mmlu/data/ ``` +- CMMLU + +```Shell +wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip +mkdir data/cmmlu +mv cmmlu_v1_0_1.zip data/cmmlu +cd data/cmmlu; unzip cmmlu_v1_0_1.zip +cd ../../ + +# Qwen-7B +python evaluate_cmmlu.py -d data/cmmlu/ +``` + - HumanEval Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data) diff --git a/eval/evaluate_ceval.py b/eval/evaluate_ceval.py index e1616a5..a6618cf 100644 --- a/eval/evaluate_ceval.py +++ b/eval/evaluate_ceval.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np +from typing import List import argparse -import datasets import torch - -from typing import List +import pandas as pd +import numpy as np from tqdm import tqdm from transformers.trainer_utils import set_seed - +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip @@ -20,29 +19,32 @@ python evaluate_ceval.py -d data/ceval/ ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = '问题:' + line['question'] + example = "问题:" + line["question"] for choice in choices: example += f'\n{choice}. 
{line[f"{choice}"]}' - + if include_answer: - example += '\n答案:' + line["answer"] + '\n\n' + example += "\n答案:" + line["answer"] + "\n\n" else: - example += '\n答案:' + example += "\n答案:" return example def generate_few_shot_prompt(k, subject, dev_df): - prompt = '' + prompt = "" if k == -1: k = dev_df.shape[0] for i in range(k): @@ -54,35 +56,37 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) - tokens = {'input_ids': input_ids} + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else '' - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else "" + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) @@ -93,44 +97,49 @@ def eval_subject( logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer("A")['input_ids']], - logits[tokenizer("B")['input_ids']], - logits[tokenizer("C")['input_ids']], - logits[tokenizer("D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer("A")["input_ids"]], + logits[tokenizer("B")["input_ids"]], + logits[tokenizer("C")["input_ids"]], + logits[tokenizer("D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, 
f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return correct_ratio @@ -139,125 +148,285 @@ def cal_ceval(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for tt in res.keys(): - name = tt.split('-')[-1] + name = tt.split("-")[-1] acc_sum += float(res[tt]) cnt += 1 class_ = TASK_NAME_MAPPING[name][2] if class_ not in acc_sum_dict: - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 if name in hard_list: hard_cnt += 1 hard_acc_sum += float(res[tt]) acc_sum_dict[class_] += float(res[tt]) cnt_dict[class_] += 1 - print('\n\n\n') - for k in ['STEM', 'Social Science', 'Humanities', 'Other']: + print("\n\n\n") + for k in ["STEM", "Social Science", "Humanities", "Other"]: if k in cnt_dict: - print('%s acc: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k])) + print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k])) if hard_cnt > 0: - print('Hard acc:%.2f ' % (hard_acc_sum / hard_cnt)) - print('AVERAGE acc:%.2f ' % (acc_sum / cnt)) + print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt)) + print("AVERAGE acc:%.2f " % (acc_sum / cnt)) TASK_NAME_MAPPING = { "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"], "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"], - "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"], + "computer_architecture": [ + "Computer Architecture", + "\u8ba1\u7b97\u673a\u7ec4\u6210", + "STEM", + ], "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"], "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"], "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"], - "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"], - "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"], - "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"], - "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"], - "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"], - "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"], + "advanced_mathematics": [ + "Advanced Mathematics", + "\u9ad8\u7b49\u6570\u5b66", + "STEM", + ], + "probability_and_statistics": [ + "Probability and Statistics", + "\u6982\u7387\u7edf\u8ba1", + "STEM", + ], + "discrete_mathematics": [ + "Discrete Mathematics", + "\u79bb\u6563\u6570\u5b66", + "STEM", + ], + "electrical_engineer": [ + "Electrical Engineer", + "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", + "STEM", + ], + "metrology_engineer": [ + "Metrology Engineer", + "\u6ce8\u518c\u8ba1\u91cf\u5e08", + "STEM", + ], + "high_school_mathematics": [ + "High School Mathematics", + "\u9ad8\u4e2d\u6570\u5b66", + "STEM", + ], "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"], - "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"], + "high_school_chemistry": [ + "High School Chemistry", + "\u9ad8\u4e2d\u5316\u5b66", + "STEM", + ], "high_school_biology": ["High School Biology", 
"\u9ad8\u4e2d\u751f\u7269", "STEM"], - "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"], - "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"], - "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"], - "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"], + "middle_school_mathematics": [ + "Middle School Mathematics", + "\u521d\u4e2d\u6570\u5b66", + "STEM", + ], + "middle_school_biology": [ + "Middle School Biology", + "\u521d\u4e2d\u751f\u7269", + "STEM", + ], + "middle_school_physics": [ + "Middle School Physics", + "\u521d\u4e2d\u7269\u7406", + "STEM", + ], + "middle_school_chemistry": [ + "Middle School Chemistry", + "\u521d\u4e2d\u5316\u5b66", + "STEM", + ], "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"], - "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"], - "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"], - "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"], - "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"], + "college_economics": [ + "College Economics", + "\u5927\u5b66\u7ecf\u6d4e\u5b66", + "Social Science", + ], + "business_administration": [ + "Business Administration", + "\u5de5\u5546\u7ba1\u7406", + "Social Science", + ], + "marxism": [ + "Marxism", + "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", + "Social Science", + ], + "mao_zedong_thought": [ + "Mao Zedong Thought", + "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", + "Social Science", + ], "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"], - "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"], - "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"], - "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"], - "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"], - "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"], - "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"], - "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"], + "teacher_qualification": [ + "Teacher Qualification", + "\u6559\u5e08\u8d44\u683c", + "Social Science", + ], + "high_school_politics": [ + "High School Politics", + "\u9ad8\u4e2d\u653f\u6cbb", + "Social Science", + ], + "high_school_geography": [ + "High School Geography", + "\u9ad8\u4e2d\u5730\u7406", + "Social Science", + ], + "middle_school_politics": [ + "Middle School Politics", + "\u521d\u4e2d\u653f\u6cbb", + "Social Science", + ], + "middle_school_geography": [ + "Middle School Geography", + "\u521d\u4e2d\u5730\u7406", + "Social Science", + ], + "modern_chinese_history": [ + "Modern Chinese History", + "\u8fd1\u4ee3\u53f2\u7eb2\u8981", + "Humanities", + ], + "ideological_and_moral_cultivation": [ + "Ideological and Moral Cultivation", + 
"\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", + "Humanities", + ], "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"], "law": ["Law", "\u6cd5\u5b66", "Humanities"], - "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"], + "chinese_language_and_literature": [ + "Chinese Language and Literature", + "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", + "Humanities", + ], "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"], - "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"], - "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"], - "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"], - "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"], - "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"], + "professional_tour_guide": [ + "Professional Tour Guide", + "\u5bfc\u6e38\u8d44\u683c", + "Humanities", + ], + "legal_professional": [ + "Legal Professional", + "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", + "Humanities", + ], + "high_school_chinese": [ + "High School Chinese", + "\u9ad8\u4e2d\u8bed\u6587", + "Humanities", + ], + "high_school_history": [ + "High School History", + "\u9ad8\u4e2d\u5386\u53f2", + "Humanities", + ], + "middle_school_history": [ + "Middle School History", + "\u521d\u4e2d\u5386\u53f2", + "Humanities", + ], "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"], "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"], "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"], "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"], "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"], - "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"], + "urban_and_rural_planner": [ + "Urban and Rural Planner", + "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", + "Other", + ], "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"], - "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"], - "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"], + "fire_engineer": [ + "Fire Engineer", + "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", + "Other", + ], + "environmental_impact_assessment_engineer": [ + "Environmental Impact Assessment Engineer", + "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", + "Other", + ], "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"], - "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"] + "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"], } -hard_list = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_physics', 'college_chemistry', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry'] +hard_list = [ + "advanced_mathematics", + "discrete_mathematics", + "probability_and_statistics", + "college_physics", + "college_chemistry", + "high_school_mathematics", + "high_school_physics", + "high_school_chemistry", +] choices = ["A", "B", "C", "D"] def main(args): model, tokenizer = load_models_tokenizer(args) - + dev_result = {} for subject_name in 
tqdm(TASK_NAME_MAPPING.keys()): - val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') + val_file_path = os.path.join( + args.eval_data_path, "val", f"{subject_name}_val.csv" + ) + dev_file_path = os.path.join( + args.eval_data_path, "dev", f"{subject_name}_dev.csv" + ) # test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') val_df = pd.read_csv(val_file_path) dev_df = pd.read_csv(dev_file_path) # test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, val_df, dev_df=dev_df, k=5, few_shot=True, - save_result_dir=f"outs/ceval_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + val_df, + dev_df=dev_df, + k=5, + few_shot=True, + save_result_dir=f"outs/ceval_eval_result", + ) dev_result[subject_name] = score cal_ceval(dev_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." 
+ ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_chat_ceval.py b/eval/evaluate_chat_ceval.py index 10d5b27..b909a6d 100644 --- a/eval/evaluate_chat_ceval.py +++ b/eval/evaluate_chat_ceval.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np import argparse -import datasets -import torch import re +import torch +import pandas as pd from thefuzz import process -from typing import List from tqdm import tqdm from transformers.trainer_utils import set_seed +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip @@ -22,13 +21,16 @@ python eval/evaluate_chat_ceval.py -d data/ceval ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def process_before_extraction(gen, question, choice_dict): @@ -57,20 +59,28 @@ def process_before_extraction(gen, question, choice_dict): gen = gen.replace(val.rstrip("。"), key) return gen + def count_substr(gen, pattern): return len(re.findall(pattern, gen)) + def extract_choice(gen, prompt, choice_list): # 答案是A | 选项是A | 应该选A选项 - res = re.search(r"(?:(?:选|选择|选定)|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|为|:|:|】))[^ABCD]{0,10}?(?:是|为|:|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|,|,|.|、|A|B|C|D|$)", gen) - + res = re.search( + r"(?:(?:选|选择|选定)[::]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|,|,|.|、|A|B|C|D|$|:|:|\)|))", + gen, + ) + # A选项正确 | A选项符合题意 if res is None: - res = re.search(r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对|符合))[^ABCD]{0,4}?(?:正确|对|符合)", gen) + res = re.search( + r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的,。:]|符合))[^ABCD]{0,4}?(?:正确|对[的,。:]|符合)", + gen, + ) # 直接输出 A if res is None: - res = re.search(r"^(A|B|C|D)(?:。|\.|,|,|.|$)", gen) + res = re.search(r"^[\((]?(A|B|C|D)(?:。|\)|)|\.|,|,|.|:|:|$)", gen) # 获取第一个出现的字母 if res is None: @@ -78,41 +88,46 @@ def extract_choice(gen, prompt, choice_list): if res is None: return choices[choice_list.index(process.extractOne(gen, choice_list)[0])] - else: - return res.group(1) + return res.group(1) + def format_example(line): - example = line['question'] + "\n\n" + example = line["question"] + "\n\n" for choice in choices: - example += f'{choice}. {line[f"{choice}"]}\n' + example += f'{choice}. 
{line[f"{choice}"]}\n' return example + def extract_answer(response, row): - prompt = row['question'] - gen = process_before_extraction(response, prompt, {choice: row[choice] for choice in choices}) + prompt = row["question"] + gen = process_before_extraction( + response, prompt, {choice: row[choice] for choice in choices} + ) if not isinstance(prompt, str): prompt = prompt[0] pred = extract_choice(gen, prompt, [row[choice] for choice in choices]) return pred + @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - save_result_dir=None, - overwrite=False, - **kwargs + model, + tokenizer, + subject_name, + test_df, + save_result_dir=None, + overwrite=False, + **kwargs ): - - result_path = os.path.join(save_result_dir, f'{subject_name}_result.csv') + result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv") if not overwrite and os.path.exists(result_path): print(f"{result_path} existed, skip!") score = [] - for (_, datarow), (_, resultrow) in zip(test_df.iterrows(), pd.read_csv(result_path).iterrows()): - pred = extract_answer(resultrow['model_response'], datarow) - correct = 1 if pred == datarow['answer'] else 0 + for (_, datarow), (_, resultrow) in zip( + test_df.iterrows(), pd.read_csv(result_path).iterrows() + ): + pred = extract_answer(resultrow["model_response"], datarow) + correct = 1 if pred == datarow["answer"] else 0 score.append(correct) correct_ratio = 100 * sum(score) / len(score) return correct_ratio @@ -124,7 +139,7 @@ def eval_subject( for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row) - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -134,22 +149,24 @@ def eval_subject( pred = extract_answer(response, row) print(pred) print("======================") - - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') responses.append(response) result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_response'] = responses - test_df['model_output'] = result + test_df["model_response"] = responses + test_df["model_output"] = result if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) @@ -162,89 +179,225 @@ def cal_ceval(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for tt in res.keys(): - name = tt.split('-')[-1] + name = tt.split("-")[-1] acc_sum += float(res[tt]) cnt += 1 class_ = TASK_NAME_MAPPING[name][2] if class_ not in acc_sum_dict: - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. 
+ acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 if name in hard_list: hard_cnt += 1 hard_acc_sum += float(res[tt]) acc_sum_dict[class_] += float(res[tt]) cnt_dict[class_] += 1 - print('\n\n\n') - for k in ['STEM', 'Social Science', 'Humanities', 'Other']: + print("\n\n\n") + for k in ["STEM", "Social Science", "Humanities", "Other"]: if k in cnt_dict: - print('%s acc: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k])) + print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k])) if hard_cnt > 0: - print('Hard acc:%.2f ' % (hard_acc_sum / hard_cnt)) - print('AVERAGE acc:%.2f ' % (acc_sum / cnt)) + print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt)) + print("AVERAGE acc:%.2f " % (acc_sum / cnt)) TASK_NAME_MAPPING = { "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"], "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"], - "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"], + "computer_architecture": [ + "Computer Architecture", + "\u8ba1\u7b97\u673a\u7ec4\u6210", + "STEM", + ], "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"], "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"], "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"], - "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"], - "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"], - "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"], - "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"], - "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"], - "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"], + "advanced_mathematics": [ + "Advanced Mathematics", + "\u9ad8\u7b49\u6570\u5b66", + "STEM", + ], + "probability_and_statistics": [ + "Probability and Statistics", + "\u6982\u7387\u7edf\u8ba1", + "STEM", + ], + "discrete_mathematics": [ + "Discrete Mathematics", + "\u79bb\u6563\u6570\u5b66", + "STEM", + ], + "electrical_engineer": [ + "Electrical Engineer", + "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", + "STEM", + ], + "metrology_engineer": [ + "Metrology Engineer", + "\u6ce8\u518c\u8ba1\u91cf\u5e08", + "STEM", + ], + "high_school_mathematics": [ + "High School Mathematics", + "\u9ad8\u4e2d\u6570\u5b66", + "STEM", + ], "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"], - "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"], + "high_school_chemistry": [ + "High School Chemistry", + "\u9ad8\u4e2d\u5316\u5b66", + "STEM", + ], "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"], - "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"], - "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"], - "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"], - "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"], + "middle_school_mathematics": [ + "Middle School Mathematics", + "\u521d\u4e2d\u6570\u5b66", + "STEM", + ], + "middle_school_biology": [ + "Middle School Biology", + "\u521d\u4e2d\u751f\u7269", + "STEM", + ], + "middle_school_physics": [ + "Middle School 
Physics", + "\u521d\u4e2d\u7269\u7406", + "STEM", + ], + "middle_school_chemistry": [ + "Middle School Chemistry", + "\u521d\u4e2d\u5316\u5b66", + "STEM", + ], "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"], - "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"], - "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"], - "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"], - "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"], + "college_economics": [ + "College Economics", + "\u5927\u5b66\u7ecf\u6d4e\u5b66", + "Social Science", + ], + "business_administration": [ + "Business Administration", + "\u5de5\u5546\u7ba1\u7406", + "Social Science", + ], + "marxism": [ + "Marxism", + "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", + "Social Science", + ], + "mao_zedong_thought": [ + "Mao Zedong Thought", + "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", + "Social Science", + ], "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"], - "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"], - "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"], - "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"], - "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"], - "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"], - "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"], - "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"], + "teacher_qualification": [ + "Teacher Qualification", + "\u6559\u5e08\u8d44\u683c", + "Social Science", + ], + "high_school_politics": [ + "High School Politics", + "\u9ad8\u4e2d\u653f\u6cbb", + "Social Science", + ], + "high_school_geography": [ + "High School Geography", + "\u9ad8\u4e2d\u5730\u7406", + "Social Science", + ], + "middle_school_politics": [ + "Middle School Politics", + "\u521d\u4e2d\u653f\u6cbb", + "Social Science", + ], + "middle_school_geography": [ + "Middle School Geography", + "\u521d\u4e2d\u5730\u7406", + "Social Science", + ], + "modern_chinese_history": [ + "Modern Chinese History", + "\u8fd1\u4ee3\u53f2\u7eb2\u8981", + "Humanities", + ], + "ideological_and_moral_cultivation": [ + "Ideological and Moral Cultivation", + "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", + "Humanities", + ], "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"], "law": ["Law", "\u6cd5\u5b66", "Humanities"], - "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"], + "chinese_language_and_literature": [ + "Chinese Language and Literature", + "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", + "Humanities", + ], "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"], - "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"], - "legal_professional": ["Legal Professional", 
"\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"], - "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"], - "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"], - "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"], + "professional_tour_guide": [ + "Professional Tour Guide", + "\u5bfc\u6e38\u8d44\u683c", + "Humanities", + ], + "legal_professional": [ + "Legal Professional", + "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", + "Humanities", + ], + "high_school_chinese": [ + "High School Chinese", + "\u9ad8\u4e2d\u8bed\u6587", + "Humanities", + ], + "high_school_history": [ + "High School History", + "\u9ad8\u4e2d\u5386\u53f2", + "Humanities", + ], + "middle_school_history": [ + "Middle School History", + "\u521d\u4e2d\u5386\u53f2", + "Humanities", + ], "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"], "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"], "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"], "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"], "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"], - "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"], + "urban_and_rural_planner": [ + "Urban and Rural Planner", + "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", + "Other", + ], "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"], - "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"], - "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"], + "fire_engineer": [ + "Fire Engineer", + "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", + "Other", + ], + "environmental_impact_assessment_engineer": [ + "Environmental Impact Assessment Engineer", + "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", + "Other", + ], "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"], - "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"] + "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"], } -hard_list = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_physics', 'college_chemistry', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry'] +hard_list = [ + "advanced_mathematics", + "discrete_mathematics", + "probability_and_statistics", + "college_physics", + "college_chemistry", + "high_school_mathematics", + "high_school_physics", + "high_school_chemistry", +] choices = ["A", "B", "C", "D"] @@ -257,34 +410,50 @@ def main(args): print("model loaded") dev_result = {} for subject_name in tqdm(TASK_NAME_MAPPING.keys()): - val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - # dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - # test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + val_file_path = os.path.join( + args.eval_data_path, "val", f"{subject_name}_val.csv" + ) val_df = pd.read_csv(val_file_path) - # dev_df = pd.read_csv(dev_file_path) - # test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, val_df, - save_result_dir=f"outs_chat/ceval_eval_result", overwrite=args.overwrite) + score = eval_subject( + model, + tokenizer, + subject_name, + 
val_df, + save_result_dir="outs_chat/ceval_eval_result", + overwrite=args.overwrite, + ) dev_result[subject_name] = score cal_ceval(dev_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') - group.add_argument("--overwrite", action='store_true', default=False, - help='Overwrite existed results') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) + group.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Overwrite existed results", + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_chat_gsm8k.py b/eval/evaluate_chat_gsm8k.py index 1358264..c4de01e 100644 --- a/eval/evaluate_chat_gsm8k.py +++ b/eval/evaluate_chat_gsm8k.py @@ -1,15 +1,10 @@ -import random -import tqdm -import os -import re -import sys -import torch -import numpy as np -import jsonlines -import argparse import json +import re from pathlib import Path -from datasets import load_from_disk,load_dataset +import argparse +import numpy as np +import tqdm +from datasets import load_from_disk, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -18,39 +13,41 @@ python eval/evaluate_chat_gsm8k.py [--use-fewshot] ''' INVALID_ANS = "[invalid]" -DEVICE = "cuda:0" +DEVICE = "cuda:0" def doc_to_text(doc, use_fewshot): if use_fewshot: - context = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n" \ - "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n" \ - "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\n" \ - "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n" \ - "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n" \ - "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n" \ - "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\n" \ - "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n" \ - f"Question: {doc['question']}\nLet's think step by step" + context = ( + "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n" + "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n" + "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. 
What's the total number of points scored by both teams added together?\nLet's think step by step\n" + "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n" + "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n" + "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n" + "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\n" + "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n" + f"Question: {doc['question']}\nLet's think step by step" + ) else: - context = doc['question'] + context = doc["question"] return context + def decode(tokens_list, tokenizer, raw_text_len): sents = [] - # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("Question:")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, question): - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -64,7 +61,9 @@ def generate_sample(model, tokenizer, question): def extract_answer_hf(completion): def _get_last_digit(s): - _PAT_LAST_DIGIT = re.compile(r"(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))") + _PAT_LAST_DIGIT = re.compile( + r"(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))" + ) match = list(_PAT_LAST_DIGIT.finditer(s)) if match: last_digit = match[-1].group().replace(",", "").replace("+", "") @@ -74,51 +73,66 @@ def extract_answer_hf(completion): print(f"No digits found in {s!r}") return last_digit - job_gen = completion.strip('.').replace('\n', '\\n') + job_gen = completion.strip(".").replace("\n", "\\n") last_digit = _get_last_digit(job_gen) if last_digit is not None: return eval(last_digit) - else: - return INVALID_ANS + return INVALID_ANS + def extract_answer(completion): try: - last_number = re.findall(r'\d+', completion)[-1] + last_number = re.findall(r"\d+", completion)[-1] return eval(last_number) except: return INVALID_ANS -def is_correct( completion, answer): + +def is_correct(completion, answer): gold = extract_answer(answer) assert gold != INVALID_ANS, "No ground truth answer found in the document." 
return extract_answer(completion) == gold -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=Path, help="Checkpoint path", default="Qwen/Qwen-7B-Chat") - parser.add_argument("-f","--sample-input-file", type=str, default=None) - parser.add_argument("-o","--sample-output-file", type=str, default="gsm8k_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=Path, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-f", "--sample-input-file", type=str, default=None) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl" + ) parser.add_argument("--use-fewshot", action="store_true") args = parser.parse_args() if args.sample_input_file is not None: - dataset = load_from_disk(args.sample_input_file)# or: + dataset = load_from_disk(args.sample_input_file) # or: else: dataset = load_dataset("gsm8k", "main") - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True, bf16=True, use_flash_attn=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True, bf16=True, use_flash_attn=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding test = dataset["test"] - f_output = open(args.sample_output_file, 'w', encoding='utf-8') + f_output = open(args.sample_output_file, "w", encoding="utf-8") tot_length = test.num_rows acc_res = [] for doc in tqdm.tqdm(test): @@ -132,6 +146,6 @@ if __name__ == '__main__': f_output.write(json.dumps(doc, ensure_ascii=False) + "\n") f_output.flush() acc_res.append(acc) - + f_output.close() print("4-shot Acc: " if args.use_fewshot else "Zero-shot Acc", np.mean(acc_res)) diff --git a/eval/evaluate_chat_humaneval.py b/eval/evaluate_chat_humaneval.py index c80c195..66dcec8 100644 --- a/eval/evaluate_chat_humaneval.py +++ b/eval/evaluate_chat_humaneval.py @@ -1,14 +1,10 @@ -import random -import tqdm -import os -import sys -import torch -import jsonlines -import argparse -import jsonlines -from pathlib import Path + import re import textwrap +import argparse +from pathlib import Path +import tqdm +import jsonlines from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -24,25 +20,31 @@ evaluate_functional_correctness HumanEval_res.jsonl DEVICE = "cuda:0" def extract_code(text, entry_point): - # 正则表达式匹配代码块 - code_block_pattern = re.compile(rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL) + code_block_pattern = re.compile( + rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is None: - code_block_pattern = 
re.compile(rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL) + code_block_pattern = re.compile( + rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is None: - code_block_pattern = re.compile(rf"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL) + code_block_pattern = re.compile( + r"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is not None: return code_block.group(1) - else: - # if no code block is found, assume the LM is simply filling the code - return textwrap.indent(text, ' ' * 4) + + # if no code block is found, assume the LM is simply filling the code + return textwrap.indent(text, " " * 4) + def generate_sample(model, tokenizer, question, entry_point): - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -52,31 +54,56 @@ def generate_sample(model, tokenizer, question, entry_point): answer = extract_code(response, entry_point) return answer, response -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=Path, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument("-f","--sample-input-file", type=str, default=None, help="data path to HumanEval.jsonl") - parser.add_argument("-o","--sample-output-file", type=str, default="HumanEval_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=Path, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument( + "-f", + "--sample-input-file", + type=str, + default=None, + help="data path to HumanEval.jsonl", + ) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" + ) args = parser.parse_args() - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) f = jsonlines.open(args.sample_input_file) with f_output as output: - for jobj in tqdm.tqdm(f, desc='task_idx'): - prompt = "Help me fill the following code.\n" + jobj['prompt'] - task_id = jobj['task_id'] - answer, response = generate_sample(model, tokenizer, prompt, jobj['entry_point']) - gen_jobjs = {'task_id': task_id, "completion": answer, 'response': response} + for jobj in tqdm.tqdm(f, desc="task_idx"): + prompt = 
"Help me fill the following code.\n" + jobj["prompt"] + task_id = jobj["task_id"] + answer, response = generate_sample( + model, tokenizer, prompt, jobj["entry_point"] + ) + gen_jobjs = {"task_id": task_id, "completion": answer, "response": response} output.write(gen_jobjs) f_output.close() diff --git a/eval/evaluate_chat_mmlu.py b/eval/evaluate_chat_mmlu.py index 1fbf94e..259dc3a 100644 --- a/eval/evaluate_chat_mmlu.py +++ b/eval/evaluate_chat_mmlu.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np import argparse -import datasets -import torch import re -from thefuzz import process -from typing import List +import torch +import pandas as pd from tqdm import tqdm +from thefuzz import process from transformers.trainer_utils import set_seed +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -22,18 +21,29 @@ python eval/evaluate_chat_mmlu.py -d data/mmlu/data/ ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def format_example(line): - example = 'The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n' + line['question'] + "\n" + example = ( + "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n" + + line["question"] + + "\n" + ) for choice in choices: example += f'{choice}. 
{line[f"{choice}"]}\n' return example @@ -47,13 +57,20 @@ def process_before_extraction(gen, choice_dict): gen = pattern.sub(key, gen) return gen + def extract_choice(gen, choice_list): # answer is A | choice is A | choose A - res = re.search(r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", gen) + res = re.search( + r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", + gen, + ) # A is correct | A is right if res is None: - res = re.search(r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", gen) + res = re.search( + r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", + gen, + ) # straight answer: A if res is None: @@ -65,32 +82,37 @@ def extract_choice(gen, choice_list): if res is None: return choices[choice_list.index(process.extractOne(gen, choice_list)[0])] - else: - return res.group(1) + return res.group(1) + def extract_answer(response, row): - gen = process_before_extraction(response, {choice: row[choice] for choice in choices}) + gen = process_before_extraction( + response, {choice: row[choice] for choice in choices} + ) pred = extract_choice(gen, [row[choice] for choice in choices]) return pred + @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - save_result_dir=None, - overwrite=False, - **kwargs + model, + tokenizer, + subject_name, + test_df, + save_result_dir=None, + overwrite=False, + **kwargs ): - result_path = os.path.join(save_result_dir, f'{subject_name}_result.csv') + result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv") if not overwrite and os.path.exists(result_path): print(f"{result_path} existed, skip!") score = [] - for (_, datarow), (_, resultrow) in zip(test_df.iterrows(), pd.read_csv(result_path).iterrows()): + for (_, datarow), (_, resultrow) in zip( + test_df.iterrows(), pd.read_csv(result_path).iterrows() + ): # pred = extract_answer(resultrow['model_response'], datarow) - pred = resultrow['model_output'] - correct = 1 if pred == datarow['answer'] else 0 + pred = resultrow["model_output"] + correct = 1 if pred == datarow["answer"] else 0 score.append(correct) return score @@ -100,7 +122,7 @@ def eval_subject( for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row) - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -111,20 +133,24 @@ def eval_subject( print(pred) print("======================") - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if save_result_dir: - test_df['model_output'] = result - test_df['model_response'] = response + test_df["model_output"] = result + test_df["model_response"] = response if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return score @@ -133,15 +159,13 @@ def cal_mmlu(res): 
acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 - hard_cnt = 0 - hard_acc_sum = 0. for class_ in TASK_NAME_MAPPING.keys(): - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 for tt in TASK_NAME_MAPPING[class_]: acc_sum += sum(res[tt]) @@ -150,13 +174,12 @@ def cal_mmlu(res): acc_sum_dict[class_] += sum(res[tt]) cnt_dict[class_] += len(res[tt]) - print('\n\n\n') + print("\n\n\n") for k in TASK_NAME_MAPPING.keys(): if k in cnt_dict: - print('%s ACC: %.2f ' % ( - k, acc_sum_dict[k] * 100 / cnt_dict[k])) - print('AVERAGE ACC:%.2f ' % (acc_sum *100 / cnt)) - + print("%s ACC: %.2f " % (k, acc_sum_dict[k] * 100 / cnt_dict[k])) + print("AVERAGE ACC:%.2f " % (acc_sum * 100 / cnt)) + def main(args): print("loading model weights") @@ -170,38 +193,122 @@ def main(args): for subject_name in tqdm(SUBJECTS): # val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') # dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}_test.csv" + ) # val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer']) # dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer']) - test_df = pd.read_csv(test_file_path, names=['question','A','B','C','D','answer']) + test_df = pd.read_csv( + test_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) - score = eval_subject(model, tokenizer, subject_name, test_df, save_result_dir=f"outs_chat/mmlu_eval_result", overwrite=args.overwrite) + score = eval_subject( + model, + tokenizer, + subject_name, + test_df, + save_result_dir=f"outs_chat/mmlu_eval_result", + overwrite=args.overwrite, + ) dev_result[subject_name] = score cal_mmlu(dev_result) -TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'], - 'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'], - 'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'], - 'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']} +TASK_NAME_MAPPING = { + "stem": [ + "abstract_algebra", + "anatomy", + "astronomy", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_physics", + "computer_security", + 
"conceptual_physics", + "electrical_engineering", + "elementary_mathematics", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_mathematics", + "high_school_physics", + "high_school_statistics", + "machine_learning", + ], + "Humanities": [ + "formal_logic", + "high_school_european_history", + "high_school_us_history", + "high_school_world_history", + "international_law", + "jurisprudence", + "logical_fallacies", + "moral_disputes", + "moral_scenarios", + "philosophy", + "prehistory", + "professional_law", + "world_religions", + ], + "other": [ + "business_ethics", + "college_medicine", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + "professional_medicine", + "virology", + "global_facts", + "clinical_knowledge", + ], + "social": [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + ], +} SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl] choices = ["A", "B", "C", "D"] -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, - help='Path to eval data') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') - group.add_argument("--overwrite", action='store_true', default=False, - help='Overwrite existed results') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data") + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." 
+ ) + group.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Overwrite existed results", + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_cmmlu.py b/eval/evaluate_cmmlu.py index aafcc57..2d2371d 100644 --- a/eval/evaluate_cmmlu.py +++ b/eval/evaluate_cmmlu.py @@ -11,39 +11,46 @@ from tqdm import tqdm from transformers.trainer_utils import set_seed -''' +""" wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip mkdir data/cmmlu mv cmmlu_v1_0_1.zip data/cmmlu cd data/cmmlu; unzip cmmlu_v1_0_1.zip cd ../../ python evaluate_cmmlu.py -d data/cmmlu/ -''' +""" + def load_models_tokenizer(args): from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = '问题:' + line['Question'] + example = "问题:" + line["Question"] for choice in choices: example += f'\n{choice}. {line[f"{choice}"]}' if include_answer: - example += '\n答案:' + line["Answer"] + '\n\n' + example += "\n答案:" + line["Answer"] + "\n\n" else: - example += '\n答案:' + example += "\n答案:" return example def generate_few_shot_prompt(k, subject, dev_df): - prompt = '' + prompt = "" if k == -1: k = dev_df.shape[0] for i in range(k): @@ -55,35 +62,37 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) - tokens = {'input_ids': input_ids} + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else [] - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else [] + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) @@ -94,51 +103,56 @@ def eval_subject( 
logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer("A")['input_ids']], - logits[tokenizer("B")['input_ids']], - logits[tokenizer("C")['input_ids']], - logits[tokenizer("D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer("A")["input_ids"]], + logits[tokenizer("B")["input_ids"]], + logits[tokenizer("C")["input_ids"]], + logits[tokenizer("D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - if 'Answer' in row: - correct = 1 if pred == row['Answer'] else 0 + if "Answer" in row: + correct = 1 if pred == row["Answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["Answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["Answer"]}') result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return correct_ratio def cal_cmmlu(res): - print('\n\n\n') - res = {k.split('-')[-1]:float(v) for k,v in res.items()} + print("\n\n\n") + res = {k.split("-")[-1]: float(v) for k, v in res.items()} for k, v in TASK_NAME_MAPPING.items(): avg_acc = np.mean(list(map(lambda x: res[x], v))) print(f"{k} acc: {avg_acc:.2f}") @@ -147,85 +161,103 @@ def cal_cmmlu(res): subcategories = { - "agronomy": ['other'], - "anatomy": ['biology'], - "ancient_chinese": ['linguistics','china specific'], - "arts": ['arts'], - "astronomy": ['physics'], - "business_ethics": ['business'], - "chinese_civil_service_exam": ['politics','china specific'], - "chinese_driving_rule": ['other','china specific'], - "chinese_food_culture": ['culture','china specific'], - "chinese_foreign_policy": ['politics','china specific'], - "chinese_history":['history','china specific'], - "chinese_literature": ['literature','china specific'], - "chinese_teacher_qualification": ['education','china specific'], - "college_actuarial_science":['math'], - "college_education":['education'], - "college_engineering_hydrology": ['engineering'], - "college_law": ['law'], - "college_mathematics": ['math'], - "college_medical_statistics":['statistics'], - "clinical_knowledge": ['other'], - "college_medicine": ['other'], - "computer_science": ['computer science'], - "computer_security": ['other'], - "conceptual_physics": ['physics'], - "construction_project_management": ['other','china specific'], - "economics": ['economics'], - "education": ['education'], - "elementary_chinese":['linguistics','china specific'], - "elementary_commonsense":['other','china specific'], - "elementary_information_and_technology": ['other'], - 
"electrical_engineering": ['engineering'], - "elementary_mathematics": ['math'], - "ethnology": ['culture','china specific'], - "food_science": ['other'], - "genetics": ['biology'], - "global_facts": ['global'], - "high_school_biology": ['biology'], - "high_school_chemistry": ['chemistry'], - "high_school_geography": ['geography'], - "high_school_mathematics": ['math'], - "high_school_physics": ['physics'], - "high_school_politics": ['politics','china specific'], - "human_sexuality": ['other'], - "international_law": ['law'], - "journalism": ['sociology'], - "jurisprudence": ['law'], - "legal_and_moral_basis": ['other'], - "logical": ['philosophy'], - "machine_learning": ['computer science'], - "management": ['business'], - "marketing": ['business'], - "marxist_theory": ['philosophy'], - "modern_chinese": ['linguistics','china specific'], - "nutrition": ['other'], - "philosophy": ['philosophy'], - "professional_accounting": ['business'], - "professional_law": ['law'], - "professional_medicine": ['other'], - "professional_psychology": ['psychology'], - "public_relations": ['politics'], - "security_study": ['politics'], - "sociology": ['culture'], - "sports_science": ['other'], - "traditional_chinese_medicine": ['other','china specific'], - "virology": ['biology'], - "world_history":['history'], - "world_religions": ['global'], + "agronomy": ["other"], + "anatomy": ["biology"], + "ancient_chinese": ["linguistics", "china specific"], + "arts": ["arts"], + "astronomy": ["physics"], + "business_ethics": ["business"], + "chinese_civil_service_exam": ["politics", "china specific"], + "chinese_driving_rule": ["other", "china specific"], + "chinese_food_culture": ["culture", "china specific"], + "chinese_foreign_policy": ["politics", "china specific"], + "chinese_history": ["history", "china specific"], + "chinese_literature": ["literature", "china specific"], + "chinese_teacher_qualification": ["education", "china specific"], + "college_actuarial_science": ["math"], + "college_education": ["education"], + "college_engineering_hydrology": ["engineering"], + "college_law": ["law"], + "college_mathematics": ["math"], + "college_medical_statistics": ["statistics"], + "clinical_knowledge": ["other"], + "college_medicine": ["other"], + "computer_science": ["computer science"], + "computer_security": ["other"], + "conceptual_physics": ["physics"], + "construction_project_management": ["other", "china specific"], + "economics": ["economics"], + "education": ["education"], + "elementary_chinese": ["linguistics", "china specific"], + "elementary_commonsense": ["other", "china specific"], + "elementary_information_and_technology": ["other"], + "electrical_engineering": ["engineering"], + "elementary_mathematics": ["math"], + "ethnology": ["culture", "china specific"], + "food_science": ["other"], + "genetics": ["biology"], + "global_facts": ["global"], + "high_school_biology": ["biology"], + "high_school_chemistry": ["chemistry"], + "high_school_geography": ["geography"], + "high_school_mathematics": ["math"], + "high_school_physics": ["physics"], + "high_school_politics": ["politics", "china specific"], + "human_sexuality": ["other"], + "international_law": ["law"], + "journalism": ["sociology"], + "jurisprudence": ["law"], + "legal_and_moral_basis": ["other"], + "logical": ["philosophy"], + "machine_learning": ["computer science"], + "management": ["business"], + "marketing": ["business"], + "marxist_theory": ["philosophy"], + "modern_chinese": ["linguistics", "china specific"], + "nutrition": ["other"], + 
"philosophy": ["philosophy"], + "professional_accounting": ["business"], + "professional_law": ["law"], + "professional_medicine": ["other"], + "professional_psychology": ["psychology"], + "public_relations": ["politics"], + "security_study": ["politics"], + "sociology": ["culture"], + "sports_science": ["other"], + "traditional_chinese_medicine": ["other", "china specific"], + "virology": ["biology"], + "world_history": ["history"], + "world_religions": ["global"], } categories = { - "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"], + "STEM": [ + "physics", + "chemistry", + "biology", + "computer science", + "math", + "engineering", + "statistics", + ], "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"], - "Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"], - "Other":["other"], + "Social Science": [ + "linguistics", + "business", + "politics", + "culture", + "economics", + "geography", + "psychology", + "education", + "sociology", + ], + "Other": ["other"], "China specific": ["china specific"], } TASK_NAME_MAPPING = defaultdict(list) -for k,v in categories.items(): +for k, v in categories.items(): for subject, subcat in subcategories.items(): for c in subcat: if c in v: @@ -240,30 +272,52 @@ def main(args): test_result = {} for subject_name in tqdm(subcategories.keys()): - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}.csv') + dev_file_path = os.path.join(args.eval_data_path, "dev", f"{subject_name}.csv") + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}.csv" + ) dev_df = pd.read_csv(dev_file_path) test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, dev_df=dev_df, test_df=test_df, k=5, few_shot=True, - save_result_dir=f"outs/cmmlu_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + dev_df=dev_df, + test_df=test_df, + k=5, + few_shot=True, + save_result_dir=f"outs/cmmlu_eval_result", + ) test_result[subject_name] = score cal_cmmlu(test_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + 
group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) args = parser.parse_args() set_seed(args.seed) diff --git a/eval/evaluate_gsm8k.py b/eval/evaluate_gsm8k.py index 49d69c8..d3c5d37 100644 --- a/eval/evaluate_gsm8k.py +++ b/eval/evaluate_gsm8k.py @@ -1,15 +1,10 @@ -import random -import tqdm -import os import re -import sys import torch -import numpy as np -import jsonlines import argparse import jsonlines +import numpy as np import datasets -from datasets import load_from_disk,load_dataset +from datasets import load_from_disk, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -17,31 +12,37 @@ from transformers.generation import GenerationConfig ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") INVALID_ANS = "[invalid]" + def doc_to_text(doc): - return fewshot_prompt + "\nQuestion: " + doc["question"] + "\nLet's think step by step\n" + return ( + fewshot_prompt + + "\nQuestion: " + + doc["question"] + + "\nLet's think step by step\n" + ) + def decode(tokens_list, tokenizer, raw_text_len): sents = [] # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("Question:")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, input_txt): input_ids = tokenizer.tokenizer.encode(input_txt) raw_text_len = len(input_ids) - context_enc = torch.tensor( - [input_ids]).to(model.device) + context_enc = torch.tensor([input_ids]).to(model.device) print(f"Input text: {input_txt}\n") outputs = model.generate(context_enc) - output_text = decode(outputs,tokenizer,raw_text_len)[0] + output_text = decode(outputs, tokenizer, raw_text_len)[0] print(f"\nOutput text: {output_text}\n") return output_text @@ -55,24 +56,34 @@ def extract_answer_hf(completion): else: return INVALID_ANS + def extract_answer(completion): try: - last_number = re.findall(r'\d+', completion)[-1] + last_number = re.findall(r"\d+", completion)[-1] return eval(last_number) except: return INVALID_ANS -def is_correct( completion, answer): + +def is_correct(completion, answer): gold = extract_answer_hf(answer) assert gold != INVALID_ANS, "No ground truth answer found in the document." 
return extract_answer(completion) == gold -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=str, help="Checkpoint path", default="Qwen/Qwen-7B") - parser.add_argument("-f","--sample-input-file", type=str, default=None) - parser.add_argument("-o","--sample-output-file", type=str, default="gsm8k_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-f", "--sample-input-file", type=str, default=None) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl" + ) args = parser.parse_args() @@ -80,31 +91,37 @@ if __name__ == '__main__': if args.sample_input_file is not None: dataset = load_from_disk(args.sample_input_file) else: - config = datasets.DownloadConfig(resume_download=True, max_retries=100) - dataset = load_dataset("gsm8k", 'main', download_config=config) + config = datasets.DownloadConfig(resume_download=True, max_retries=100) + dataset = load_dataset("gsm8k", "main", download_config=config) test = dataset["test"] - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False - - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) tot_length = test.num_rows acc_res = [] for doc in test: context = doc_to_text(doc) completion = generate_sample(model, tokenizer, context) - answer= doc["answer"] + answer = doc["answer"] acc = is_correct(completion, answer) - doc["completion"]=completion - doc["acc"]=acc + doc["completion"] = completion + doc["acc"] = acc f_output.write(doc) acc_res.append(acc) - + f_output.close() - print("Acc: ",np.mean(acc_res)) \ No newline at end of file + print("Acc: ", np.mean(acc_res)) diff --git a/eval/evaluate_humaneval.py b/eval/evaluate_humaneval.py index af78319..78eb744 100644 --- a/eval/evaluate_humaneval.py +++ b/eval/evaluate_humaneval.py @@ -1,11 +1,7 @@ -import random +import argparse import tqdm -import os -import sys import torch import jsonlines -import argparse -import jsonlines from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -15,56 +11,75 @@ $ pip install -e human-eval evaluate_functional_correctness sample-output-file """ + def decode(tokens_list, tokenizer, raw_text_len): sents = [] # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = 
sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("def ")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, input_txt): input_ids = tokenizer.tokenizer.encode(input_txt) raw_text_len = len(input_ids) - context_enc = torch.tensor([input_ids] ).to(model.device) + context_enc = torch.tensor([input_ids]).to(model.device) print(f"Input text: {input_txt}\n") outputs = model.generate(context_enc) - output_text = decode(outputs,tokenizer,raw_text_len)[0] + output_text = decode(outputs, tokenizer, raw_text_len)[0] print(f"\nOutput text: \n{output_text}\n") return output_text -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument("-f","--sample-input-file", type=str, default=None, help="data path to HumanEval.jsonl") - parser.add_argument("-o","--sample-output-file", type=str, default="HumanEval_res.jsonl") - +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument( + "-f", + "--sample-input-file", + type=str, + default=None, + help="data path to HumanEval.jsonl", + ) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" + ) args = parser.parse_args() - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False - - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) f = jsonlines.open(args.sample_input_file) with f_output as output: - for jobj in tqdm.tqdm(f, desc='task_idx'): - prompt = jobj['prompt'] - task_id = jobj['task_id'] + for jobj in tqdm.tqdm(f, desc="task_idx"): + prompt = jobj["prompt"] + task_id = jobj["task_id"] gen_sents = generate_sample(model, tokenizer, prompt) - gen_jobjs = {'task_id': task_id, "completion": gen_sents} + gen_jobjs = {"task_id": task_id, "completion": gen_sents} output.write(gen_jobjs) - f_output.close() \ No newline at end of file + f_output.close() diff --git a/eval/evaluate_mmlu.py b/eval/evaluate_mmlu.py index 1b6970c..2843434 100644 --- a/eval/evaluate_mmlu.py +++ b/eval/evaluate_mmlu.py @@ -1,57 +1,60 @@ import os +from typing import List import pandas as pd import numpy as np import argparse -import datasets import torch - -from typing import List from tqdm import tqdm from transformers.trainer_utils import set_seed +from transformers 
import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig - -''' +""" wget https://people.eecs.berkeley.edu/~hendrycks/data.tar mkdir data/mmlu mv data.tar data/mmlu cd data/mmlu; tar xf data.tar cd ../../ python eval/evaluate_mmlu.py -d data/mmlu/data/ -''' +""" def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = 'Question: ' + line['question'] + example = "Question: " + line["question"] for choice in choices: example += f'\n{choice}. {line[f"{choice}"]}' - + if include_answer: - example += '\nAnswer: ' + line["answer"] + '\n\n' + example += "\nAnswer: " + line["answer"] + "\n\n" else: - example += '\nAnswer:' + example += "\nAnswer:" return example def generate_few_shot_prompt(k, subject, dev_df): - def format_subject(subject): l = subject.split("_") s = "" for entry in l: s += " " + entry return s.strip() - - prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject)) + + prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format( + format_subject(subject) + ) if k == -1: k = dev_df.shape[0] @@ -64,81 +67,87 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) if input_ids.shape[1] > args.max_seq_len: - input_ids = input_ids[:, input_ids.shape[1]-args.max_seq_len+1:] - tokens = {'input_ids': input_ids} + input_ids = input_ids[:, input_ids.shape[1] - args.max_seq_len + 1 :] + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else [] - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else [] + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in 
tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) full_prompt = few_shot_prompt + question - + output, input_info = get_logits(tokenizer, model, [full_prompt]) assert output.shape[0] == 1 logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer(" A")['input_ids']], - logits[tokenizer(" B")['input_ids']], - logits[tokenizer(" C")['input_ids']], - logits[tokenizer(" D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer(" A")["input_ids"]], + logits[tokenizer(" B")["input_ids"]], + logits[tokenizer(" C")["input_ids"]], + logits[tokenizer(" D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return score @@ -147,15 +156,15 @@ def cal_mmlu(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for class_ in TASK_NAME_MAPPING.keys(): - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. 
+ acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 for tt in TASK_NAME_MAPPING[class_]: acc_sum += sum(res[tt]) @@ -164,13 +173,12 @@ def cal_mmlu(res): acc_sum_dict[class_] += sum(res[tt]) cnt_dict[class_] += len(res[tt]) - print('\n\n\n', 'total cnt:', cnt, '\n') + print("\n\n\n", "total cnt:", cnt, "\n") for k in TASK_NAME_MAPPING.keys(): if k in cnt_dict: - print('%s ACC: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k] * 100)) - print('AVERAGE ACC:%.2f ' % (acc_sum / cnt * 100)) - + print("%s ACC: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k] * 100)) + print("AVERAGE ACC:%.2f " % (acc_sum / cnt * 100)) + def main(args): model, tokenizer = load_models_tokenizer(args) @@ -178,41 +186,130 @@ def main(args): dev_result = {} for subject_name in tqdm(SUBJECTS): # val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + dev_file_path = os.path.join( + args.eval_data_path, "dev", f"{subject_name}_dev.csv" + ) + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}_test.csv" + ) # val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer']) - dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer']) - test_df = pd.read_csv(test_file_path, names=['question','A','B','C','D','answer']) + dev_df = pd.read_csv( + dev_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) + test_df = pd.read_csv( + test_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) - score = eval_subject(model, tokenizer, subject_name, test_df, dev_df=dev_df, k=5, few_shot=True, - save_result_dir=f"outs/mmlu_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + test_df, + dev_df=dev_df, + k=5, + few_shot=True, + save_result_dir=f"outs/mmlu_eval_result", + ) dev_result[subject_name] = score cal_mmlu(dev_result) -TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'], - 'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'], - 'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'], - 'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']} +TASK_NAME_MAPPING = { + "stem": [ + "abstract_algebra", + "anatomy", + "astronomy", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_physics", + 
"computer_security", + "conceptual_physics", + "electrical_engineering", + "elementary_mathematics", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_mathematics", + "high_school_physics", + "high_school_statistics", + "machine_learning", + ], + "Humanities": [ + "formal_logic", + "high_school_european_history", + "high_school_us_history", + "high_school_world_history", + "international_law", + "jurisprudence", + "logical_fallacies", + "moral_disputes", + "moral_scenarios", + "philosophy", + "prehistory", + "professional_law", + "world_religions", + ], + "other": [ + "business_ethics", + "college_medicine", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + "professional_medicine", + "virology", + "global_facts", + "clinical_knowledge", + ], + "social": [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + ], +} SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl] choices = ["A", "B", "C", "D"] -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - parser.add_argument('--gpu', type=int, default=0, help='gpu id') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + parser.add_argument("--gpu", type=int, default=0, help="gpu id") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data") + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." 
+ ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_plugin.py b/eval/evaluate_plugin.py index 89974ad..f3b953b 100644 --- a/eval/evaluate_plugin.py +++ b/eval/evaluate_plugin.py @@ -12,47 +12,48 @@ from transformers.generation import GenerationConfig from transformers.tools.evaluate_agent import evaluate_agent from transformers.trainer_utils import set_seed -data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'data') +data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") def is_callable(response, golden): - return response['action'].strip().lower() == golden['action'].strip( - ).lower() + return response["action"].strip().lower() == golden["action"].strip().lower() def process_res(response): # parse response - response += '\n' # fix not-find bug - thought = response[:response.find('Action:')].strip() - action = response[response.find('Action:') + - len('Action:'):response.find('Action Input:')].strip() - action_input = response[response.find('Action Input:') + - len('Action Input:'):response.find('Observation:' - )].strip() - #TODO: This parsing result is incorrect if the response contains multiple Actions. To be fixed in the future. - observation = response[response.find('Observation:') + - len('Observation:'):response.rfind('Thought:' - )].strip() - thought_last = response[response.rfind('Thought:') + - len('Thought:'):response.find('Final Answer:' - )].strip() - final_answer = response[response.find('Final Answer:') + - len('Final Answer:'):].strip() + response += "\n" # fix not-find bug + thought = response[: response.find("Action:")].strip() + action = response[ + response.find("Action:") + len("Action:") : response.find("Action Input:") + ].strip() + action_input = response[ + response.find("Action Input:") + + len("Action Input:") : response.find("Observation:") + ].strip() + # TODO: This parsing result is incorrect if the response contains multiple Actions. To be fixed in the future. 
+ observation = response[ + response.find("Observation:") + len("Observation:") : response.rfind("Thought:") + ].strip() + thought_last = response[ + response.rfind("Thought:") + len("Thought:") : response.find("Final Answer:") + ].strip() + final_answer = response[ + response.find("Final Answer:") + len("Final Answer:") : + ].strip() try: - action_input = json.dumps(json5.loads(action_input), - ensure_ascii=False, - sort_keys=True) + action_input = json.dumps( + json5.loads(action_input), ensure_ascii=False, sort_keys=True + ) except: # print("JSON Load Error:", action_input) pass res_dict = { - 'thought': thought, - 'action': action, - 'action_input': action_input, - 'observation': observation, - 'thought_last': thought_last, - 'final_answer': final_answer + "thought": thought, + "action": action, + "action_input": action_input, + "observation": observation, + "thought_last": thought_last, + "final_answer": final_answer, } return res_dict @@ -68,20 +69,18 @@ def _get_tokenized_string(tokenizer, text_list): assert tokenizer is not None token_ids = tokenizer.encode(text) tokens_bytes = tokenizer.convert_ids_to_tokens(token_ids) - tokens = [ - token.decode('utf-8', errors='replace') for token in tokens_bytes - ] - tokenized_string = ' '.join(tokens) + tokens = [token.decode("utf-8", errors="replace") for token in tokens_bytes] + tokenized_string = " ".join(tokens) token_ids_list.append(token_ids) tokenized_string_list.append(tokenized_string) return token_ids_list, tokenized_string_list def eval_action(job): - response = job['gen'][0] - golden = job['response'] + response = job["gen"][0] + golden = job["response"] - if 'Action:' in response: + if "Action:" in response: response, golden = process_res(response), process_res(golden) if is_callable(response, golden): return True @@ -89,26 +88,29 @@ def eval_action(job): def eval_action_input(job, tokenizer): - response = job['gen'][0] - golden = job['response'] + response = job["gen"][0] + golden = job["response"] response, golden = process_res(response), process_res(golden) - query = job['prompt'] + query = job["prompt"] job = {} - job['prompt'] = query - job['gen'] = response['action_input'] - job['response'] = golden['action_input'] + job["prompt"] = query + job["gen"] = response["action_input"] + job["response"] = golden["action_input"] - job['_gen_tok'], job['_gen_tok_str'] = _get_tokenized_string( - tokenizer, [response['action_input']]) - job['_reference_tok'], job['_reference_tok_str'] = _get_tokenized_string( - tokenizer, [golden['action_input']]) + job["_gen_tok"], job["_gen_tok_str"] = _get_tokenized_string( + tokenizer, [response["action_input"]] + ) + job["_reference_tok"], job["_reference_tok_str"] = _get_tokenized_string( + tokenizer, [golden["action_input"]] + ) - scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], - tokenizer=_DummyTokenizer()) - score = scorer.score(job['_reference_tok_str'][0], job['_gen_tok_str'][0]) + scorer = rouge_scorer.RougeScorer( + ["rouge1", "rouge2", "rougeL"], tokenizer=_DummyTokenizer() + ) + score = scorer.score(job["_reference_tok_str"][0], job["_gen_tok_str"][0]) - rouge = score['rougeL'].fmeasure + rouge = score["rougeL"].fmeasure return rouge @@ -124,24 +126,33 @@ class QWenAgent(Agent): agent.run("Draw me a picture of rivers and lakes.") ``` """ - def __init__(self, - chat_prompt_template=None, - run_prompt_template=None, - additional_tools=None, - tokenizer=None, - model=None): + + def __init__( + self, + chat_prompt_template=None, + run_prompt_template=None, + 
additional_tools=None, + tokenizer=None, + model=None, + ): if tokenizer and model: self.tokenizer = tokenizer self.model = model else: - checkpoint = 'Qwen/Qwen-7B-Chat' + checkpoint = "Qwen/Qwen-7B-Chat" self.tokenizer = AutoTokenizer.from_pretrained( - checkpoint, trust_remote_code=True) - self.model = AutoModelForCausalLM.from_pretrained( - checkpoint, device_map='auto', - trust_remote_code=True).cuda().eval() + checkpoint, trust_remote_code=True + ) + self.model = ( + AutoModelForCausalLM.from_pretrained( + checkpoint, device_map="auto", trust_remote_code=True + ) + .cuda() + .eval() + ) self.model.generation_config = GenerationConfig.from_pretrained( - checkpoint, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 + checkpoint, trust_remote_code=True + ) # 可指定不同的生成长度、top_p等相关超参 self.model.generation_config.do_sample = False # greedy super().__init__( @@ -152,155 +163,161 @@ class QWenAgent(Agent): def generate_one(self, prompt, stop): # "Human:" 和 "Assistant:" 曾为通义千问的特殊保留字,需要替换为 "_HUMAN_:" 和 "_ASSISTANT_:"。这一问题将在未来版本修复。 - prompt = prompt.replace('Human:', - '_HUMAN_:').replace('Assistant:', - '_ASSISTANT_:') + prompt = prompt.replace("Human:", "_HUMAN_:").replace( + "Assistant:", "_ASSISTANT_:" + ) stop = [ - item.replace('Human:', '_HUMAN_:').replace('Assistant:', - '_ASSISTANT_:') + item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:") for item in stop ] result, _ = self.model.chat(self.tokenizer, prompt, history=None) for stop_seq in stop: if result.endswith(stop_seq): - result = result[:-len(stop_seq)] + result = result[: -len(stop_seq)] - result = result.replace('_HUMAN_:', - 'Human:').replace('_ASSISTANT_:', 'Assistant:') + result = result.replace("_HUMAN_:", "Human:").replace( + "_ASSISTANT_:", "Assistant:" + ) return result def load_models_tokenizer(args): - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, - trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, - device_map='auto', - trust_remote_code=True, - bf16=True, - use_flash_attn=True).eval() + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() model.generation_config = GenerationConfig.from_pretrained( - args.checkpoint_path, trust_remote_code=True) + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def load_jobs(filename): jobs = [] - with jsonlines.open(os.path.join(data_root_path, filename), - mode='r') as reader: + with jsonlines.open(os.path.join(data_root_path, filename), mode="r") as reader: for job in reader: jobs.append(job) return jobs def react_inference(filename, model, tokenizer): - filename_cache = filename + '.cache' + filename_cache = filename + ".cache" if os.path.exists(os.path.join(data_root_path, filename_cache)): jobs = load_jobs(filename=filename_cache) - print('Loaded from', filename_cache) + print("Loaded from", filename_cache) else: - with open(os.path.join(data_root_path, filename_cache), 'w') as f: + with open(os.path.join(data_root_path, filename_cache), "w") as f: jobs = load_jobs(filename=filename) - print('Inference:', filename) + print("Inference:", filename) for job in tqdm(jobs): - response, history = model.chat(tokenizer, - job['prompt'], - history=None) - job['gen'] = [response] - f.writelines(json.dumps(job, 
ensure_ascii=False) + '\n') - print(filename_cache, 'is saved.') + response, history = model.chat(tokenizer, job["prompt"], history=None) + job["gen"] = [response] + f.writelines(json.dumps(job, ensure_ascii=False) + "\n") + print(filename_cache, "is saved.") return jobs def main(args): - print('loading model weights') + print("loading model weights") if args.checkpoint_path is not None: model, tokenizer = load_models_tokenizer(args) else: model, tokenizer = None, None - print('model loaded') + print("model loaded") result = {} # eval react positive if args.eval_react_positive: - print('eval react positive ...') + print("eval react positive ...") acc_count = 0 rouge_mean = 0 - jobs = react_inference(filename=args.eval_react_positive_filename, - model=model, - tokenizer=tokenizer) + jobs = react_inference( + filename=args.eval_react_positive_filename, model=model, tokenizer=tokenizer + ) for job in jobs: if eval_action(job): acc_count += 1 rouge = eval_action_input(job, tokenizer) - rouge_mean += (rouge / len(jobs)) + rouge_mean += rouge / len(jobs) scores = { - 'action_right_rate': acc_count / len(jobs), - 'action_input_rouge': rouge_mean, + "action_right_rate": acc_count / len(jobs), + "action_input_rouge": rouge_mean, } - result.update({'react_positive': scores}) + result.update({"react_positive": scores}) # eval react negative if args.eval_react_negative: - print('eval react negative ...') + print("eval react negative ...") bad_count = 0 - jobs = react_inference(filename=args.eval_react_negative_filename, - model=model, - tokenizer=tokenizer) + jobs = react_inference( + filename=args.eval_react_negative_filename, model=model, tokenizer=tokenizer + ) for job in jobs: - if '\nAction:' in job['gen'][0]: + if "\nAction:" in job["gen"][0]: bad_count += 1 - scores = {'bad_rate': bad_count / len(jobs)} - result.update({'react_negative': scores}) + scores = {"bad_rate": bad_count / len(jobs)} + result.update({"react_negative": scores}) # eval hfagent if args.eval_hfagent: - print('eval hfagent ...') + print("eval hfagent ...") agent = QWenAgent(model=model, tokenizer=tokenizer) scores = evaluate_agent(agent, verbose=False, return_errors=False) - result.update({'hfagent': scores}) + result.update({"hfagent": scores}) pp = pprint.PrettyPrinter(indent=4) pp.pprint(result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', - '--checkpoint-path', - type=str, - help='Checkpoint path', - default='Qwen/Qwen-7B-Chat') - parser.add_argument('-s', - '--seed', - type=int, - default=1234, - help='Random seed') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('--eval-react-positive', - action='store_true', - default=False, - help='Eval react positive.') - group.add_argument('--eval-react-positive-filename', - type=str, - default='exam_plugin_v1_react_positive.jsonl', - help='Eval react positive filename.') - group.add_argument('--eval-react-negative', - action='store_true', - default=False, - help='Eval react negative.') - group.add_argument('--eval-react-negative-filename', - type=str, - default='exam_plugin_v1_react_negative.jsonl', - help='Eval 
react negative filename.') - group.add_argument('--eval-hfagent', - action='store_true', - default=False, - help='Eval hfagent.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "--eval-react-positive", + action="store_true", + default=False, + help="Eval react positive.", + ) + group.add_argument( + "--eval-react-positive-filename", + type=str, + default="exam_plugin_v1_react_positive.jsonl", + help="Eval react positive filename.", + ) + group.add_argument( + "--eval-react-negative", + action="store_true", + default=False, + help="Eval react negative.", + ) + group.add_argument( + "--eval-react-negative-filename", + type=str, + default="exam_plugin_v1_react_negative.jsonl", + help="Eval react negative filename.", + ) + group.add_argument( + "--eval-hfagent", action="store_true", default=False, help="Eval hfagent." + ) args = parser.parse_args() set_seed(args.seed) From 071a4af3658add1eeb4de15e61aab8193c6d6bb0 Mon Sep 17 00:00:00 2001 From: JustinLin610 Date: Sun, 27 Aug 2023 18:27:51 +0800 Subject: [PATCH 24/27] empty cache for reset state --- web_demo.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/web_demo.py b/web_demo.py index e94f6be..71d3257 100755 --- a/web_demo.py +++ b/web_demo.py @@ -9,6 +9,7 @@ from argparse import ArgumentParser import gradio as gr import mdtex2html +import torch.cuda from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -143,9 +144,13 @@ def _launch_demo(args, model, tokenizer, config): def reset_user_input(): return gr.update(value="") - def reset_state(_task_history): + def reset_state(_chatbot, _task_history): _task_history.clear() - return [] + _chatbot.clear() + import gc + gc.collect() + torch.cuda.empty_cache() + return _chatbot with gr.Blocks() as demo: gr.Markdown("""\ @@ -174,7 +179,7 @@ Qwen-7B-Chat submit_btn.click(predict, [query, chatbot, task_history], [chatbot], show_progress=True) submit_btn.click(reset_user_input, [], [query]) - empty_btn.click(reset_state, [task_history], outputs=[chatbot], show_progress=True) + empty_btn.click(reset_state, [chatbot, task_history], outputs=[chatbot], show_progress=True) regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True) gr.Markdown("""\ From 50131acda4cbb2a03dea36be6718265af4672507 Mon Sep 17 00:00:00 2001 From: JustinLin610 Date: Sun, 27 Aug 2023 18:30:48 +0800 Subject: [PATCH 25/27] update import --- web_demo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web_demo.py b/web_demo.py index 71d3257..922df00 100755 --- a/web_demo.py +++ b/web_demo.py @@ -9,7 +9,8 @@ from argparse import ArgumentParser import gradio as gr import mdtex2html -import torch.cuda + +import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig From 67034107700866640903c8501f3588f3f85dde09 Mon Sep 17 00:00:00 2001 From: JustinLin610 Date: Sun, 27 Aug 2023 18:47:09 +0800 Subject: [PATCH 26/27] update default path --- web_demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_demo.py b/web_demo.py index 922df00..0a0914a 100755 --- a/web_demo.py +++ b/web_demo.py @@ -14,7 +14,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig -DEFAULT_CKPT_PATH = 'QWen/QWen-7B-Chat' +DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat' def _get_args(): From 09f189449c1cb76177be4cf41f18d3943a03c096 Mon Sep 17 
00:00:00 2001 From: JustinLin610 Date: Sun, 27 Aug 2023 18:49:11 +0800 Subject: [PATCH 27/27] update default path --- web_demo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/web_demo.py b/web_demo.py index 0a0914a..11d7a56 100755 --- a/web_demo.py +++ b/web_demo.py @@ -14,6 +14,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig + DEFAULT_CKPT_PATH = 'Qwen/Qwen-7B-Chat'
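
The last few patches (24-27) touch `web_demo.py`; patch 24 makes the reset handler clear both the visible chat and the task history before releasing cached GPU memory. A minimal stand-alone sketch of that reset logic follows, assuming only that `torch` is installed: the Gradio wiring is omitted, and plain lists stand in for the demo's `_chatbot` / `_task_history` objects purely for illustration.

```
# Minimal sketch of the reset behaviour introduced for web_demo.py (PATCH 24/27).
# In the demo this function is bound to a Gradio button; here plain lists stand
# in for the _chatbot and _task_history objects, purely for illustration.
import gc

import torch


def reset_state(_chatbot, _task_history):
    # Drop all references to past turns first, then reclaim memory.
    _task_history.clear()
    _chatbot.clear()
    gc.collect()
    if torch.cuda.is_available():
        # Return memory held by PyTorch's caching allocator to the device.
        torch.cuda.empty_cache()
    return _chatbot


if __name__ == "__main__":
    chatbot = [("hello", "hi there")]
    task_history = [("hello", "hi there")]
    print(reset_state(chatbot, task_history))  # -> []
```

Calling `torch.cuda.empty_cache()` only hands memory held by PyTorch's caching allocator back to the device; it cannot free tensors that are still referenced, which is why the histories are cleared and `gc.collect()` runs first.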