From 4864f7b27849398c975be8ea06024f0e313fc7f4 Mon Sep 17 00:00:00 2001 From: "feihu.hf" Date: Fri, 25 Aug 2023 22:44:07 +0800 Subject: [PATCH] fix format problems in evaluation code; update ceval extraction rules --- eval/EVALUATION.md | 13 + eval/evaluate_ceval.py | 407 ++++++++++++++++++++++---------- eval/evaluate_chat_ceval.py | 387 +++++++++++++++++++++--------- eval/evaluate_chat_gsm8k.py | 108 +++++---- eval/evaluate_chat_humaneval.py | 97 +++++--- eval/evaluate_chat_mmlu.py | 247 +++++++++++++------ eval/evaluate_cmmlu.py | 320 ++++++++++++++----------- eval/evaluate_gsm8k.py | 91 ++++--- eval/evaluate_humaneval.py | 75 +++--- eval/evaluate_mmlu.py | 275 ++++++++++++++------- eval/evaluate_plugin.py | 301 ++++++++++++----------- 11 files changed, 1510 insertions(+), 811 deletions(-) diff --git a/eval/EVALUATION.md b/eval/EVALUATION.md index 44e0af6..1381e69 100644 --- a/eval/EVALUATION.md +++ b/eval/EVALUATION.md @@ -34,6 +34,19 @@ pip install thefuzz python evaluate_chat_mmlu.py -d data/mmlu/data/ ``` +- CMMLU + +```Shell +wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip +mkdir data/cmmlu +mv cmmlu_v1_0_1.zip data/cmmlu +cd data/cmmlu; unzip cmmlu_v1_0_1.zip +cd ../../ + +# Qwen-7B +python evaluate_cmmlu.py -d data/cmmlu/ +``` + - HumanEval Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data) diff --git a/eval/evaluate_ceval.py b/eval/evaluate_ceval.py index e1616a5..a6618cf 100644 --- a/eval/evaluate_ceval.py +++ b/eval/evaluate_ceval.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np +from typing import List import argparse -import datasets import torch - -from typing import List +import pandas as pd +import numpy as np from tqdm import tqdm from transformers.trainer_utils import set_seed - +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip @@ -20,29 +19,32 @@ python evaluate_ceval.py -d data/ceval/ ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = '问题:' + line['question'] + example = "问题:" + line["question"] for choice in choices: example += f'\n{choice}. {line[f"{choice}"]}' - + if include_answer: - example += '\n答案:' + line["answer"] + '\n\n' + example += "\n答案:" + line["answer"] + "\n\n" else: - example += '\n答案:' + example += "\n答案:" return example def generate_few_shot_prompt(k, subject, dev_df): - prompt = '' + prompt = "" if k == -1: k = dev_df.shape[0] for i in range(k): @@ -54,35 +56,37 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) - tokens = {'input_ids': input_ids} + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else '' - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else "" + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) @@ -93,44 +97,49 @@ def eval_subject( logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer("A")['input_ids']], - logits[tokenizer("B")['input_ids']], - logits[tokenizer("C")['input_ids']], - logits[tokenizer("D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer("A")["input_ids"]], + logits[tokenizer("B")["input_ids"]], + logits[tokenizer("C")["input_ids"]], + logits[tokenizer("D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return correct_ratio @@ -139,125 +148,285 @@ def cal_ceval(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for tt in res.keys(): - name = tt.split('-')[-1] + name = tt.split("-")[-1] acc_sum += float(res[tt]) cnt += 1 class_ = TASK_NAME_MAPPING[name][2] if class_ not in acc_sum_dict: - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 if name in hard_list: hard_cnt += 1 hard_acc_sum += float(res[tt]) acc_sum_dict[class_] += float(res[tt]) cnt_dict[class_] += 1 - print('\n\n\n') - for k in ['STEM', 'Social Science', 'Humanities', 'Other']: + print("\n\n\n") + for k in ["STEM", "Social Science", "Humanities", "Other"]: if k in cnt_dict: - print('%s acc: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k])) + print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k])) if hard_cnt > 0: - print('Hard acc:%.2f ' % (hard_acc_sum / hard_cnt)) - print('AVERAGE acc:%.2f ' % (acc_sum / cnt)) + print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt)) + print("AVERAGE acc:%.2f " % (acc_sum / cnt)) TASK_NAME_MAPPING = { "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"], "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"], - "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"], + "computer_architecture": [ + "Computer Architecture", + "\u8ba1\u7b97\u673a\u7ec4\u6210", + "STEM", + ], "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"], "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"], "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"], - "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"], - "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"], - "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"], - "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"], - "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"], - "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"], + "advanced_mathematics": [ + "Advanced Mathematics", + "\u9ad8\u7b49\u6570\u5b66", + "STEM", + ], + "probability_and_statistics": [ + "Probability and Statistics", + "\u6982\u7387\u7edf\u8ba1", + "STEM", + ], + "discrete_mathematics": [ + "Discrete Mathematics", + "\u79bb\u6563\u6570\u5b66", + "STEM", + ], + "electrical_engineer": [ + "Electrical Engineer", + "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", + "STEM", + ], + "metrology_engineer": [ + "Metrology Engineer", + "\u6ce8\u518c\u8ba1\u91cf\u5e08", + "STEM", + ], + "high_school_mathematics": [ + "High School Mathematics", + "\u9ad8\u4e2d\u6570\u5b66", + "STEM", + ], "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"], - "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"], + "high_school_chemistry": [ + "High School Chemistry", + "\u9ad8\u4e2d\u5316\u5b66", + "STEM", + ], "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"], - "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"], - "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"], - "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"], - "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"], + "middle_school_mathematics": [ + "Middle School Mathematics", + "\u521d\u4e2d\u6570\u5b66", + "STEM", + ], + "middle_school_biology": [ + "Middle School Biology", + "\u521d\u4e2d\u751f\u7269", + "STEM", + ], + "middle_school_physics": [ + "Middle School Physics", + "\u521d\u4e2d\u7269\u7406", + "STEM", + ], + "middle_school_chemistry": [ + "Middle School Chemistry", + "\u521d\u4e2d\u5316\u5b66", + "STEM", + ], "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"], - "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"], - "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"], - "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"], - "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"], + "college_economics": [ + "College Economics", + "\u5927\u5b66\u7ecf\u6d4e\u5b66", + "Social Science", + ], + "business_administration": [ + "Business Administration", + "\u5de5\u5546\u7ba1\u7406", + "Social Science", + ], + "marxism": [ + "Marxism", + "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", + "Social Science", + ], + "mao_zedong_thought": [ + "Mao Zedong Thought", + "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", + "Social Science", + ], "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"], - "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"], - "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"], - "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"], - "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"], - "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"], - "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"], - "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"], + "teacher_qualification": [ + "Teacher Qualification", + "\u6559\u5e08\u8d44\u683c", + "Social Science", + ], + "high_school_politics": [ + "High School Politics", + "\u9ad8\u4e2d\u653f\u6cbb", + "Social Science", + ], + "high_school_geography": [ + "High School Geography", + "\u9ad8\u4e2d\u5730\u7406", + "Social Science", + ], + "middle_school_politics": [ + "Middle School Politics", + "\u521d\u4e2d\u653f\u6cbb", + "Social Science", + ], + "middle_school_geography": [ + "Middle School Geography", + "\u521d\u4e2d\u5730\u7406", + "Social Science", + ], + "modern_chinese_history": [ + "Modern Chinese History", + "\u8fd1\u4ee3\u53f2\u7eb2\u8981", + "Humanities", + ], + "ideological_and_moral_cultivation": [ + "Ideological and Moral Cultivation", + "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", + "Humanities", + ], "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"], "law": ["Law", "\u6cd5\u5b66", "Humanities"], - "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"], + "chinese_language_and_literature": [ + "Chinese Language and Literature", + "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", + "Humanities", + ], "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"], - "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"], - "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"], - "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"], - "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"], - "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"], + "professional_tour_guide": [ + "Professional Tour Guide", + "\u5bfc\u6e38\u8d44\u683c", + "Humanities", + ], + "legal_professional": [ + "Legal Professional", + "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", + "Humanities", + ], + "high_school_chinese": [ + "High School Chinese", + "\u9ad8\u4e2d\u8bed\u6587", + "Humanities", + ], + "high_school_history": [ + "High School History", + "\u9ad8\u4e2d\u5386\u53f2", + "Humanities", + ], + "middle_school_history": [ + "Middle School History", + "\u521d\u4e2d\u5386\u53f2", + "Humanities", + ], "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"], "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"], "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"], "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"], "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"], - "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"], + "urban_and_rural_planner": [ + "Urban and Rural Planner", + "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", + "Other", + ], "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"], - "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"], - "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"], + "fire_engineer": [ + "Fire Engineer", + "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", + "Other", + ], + "environmental_impact_assessment_engineer": [ + "Environmental Impact Assessment Engineer", + "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", + "Other", + ], "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"], - "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"] + "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"], } -hard_list = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_physics', 'college_chemistry', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry'] +hard_list = [ + "advanced_mathematics", + "discrete_mathematics", + "probability_and_statistics", + "college_physics", + "college_chemistry", + "high_school_mathematics", + "high_school_physics", + "high_school_chemistry", +] choices = ["A", "B", "C", "D"] def main(args): model, tokenizer = load_models_tokenizer(args) - + dev_result = {} for subject_name in tqdm(TASK_NAME_MAPPING.keys()): - val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') + val_file_path = os.path.join( + args.eval_data_path, "val", f"{subject_name}_val.csv" + ) + dev_file_path = os.path.join( + args.eval_data_path, "dev", f"{subject_name}_dev.csv" + ) # test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') val_df = pd.read_csv(val_file_path) dev_df = pd.read_csv(dev_file_path) # test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, val_df, dev_df=dev_df, k=5, few_shot=True, - save_result_dir=f"outs/ceval_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + val_df, + dev_df=dev_df, + k=5, + few_shot=True, + save_result_dir=f"outs/ceval_eval_result", + ) dev_result[subject_name] = score cal_ceval(dev_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_chat_ceval.py b/eval/evaluate_chat_ceval.py index 10d5b27..b909a6d 100644 --- a/eval/evaluate_chat_ceval.py +++ b/eval/evaluate_chat_ceval.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np import argparse -import datasets -import torch import re +import torch +import pandas as pd from thefuzz import process -from typing import List from tqdm import tqdm from transformers.trainer_utils import set_seed +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip @@ -22,13 +21,16 @@ python eval/evaluate_chat_ceval.py -d data/ceval ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def process_before_extraction(gen, question, choice_dict): @@ -57,20 +59,28 @@ def process_before_extraction(gen, question, choice_dict): gen = gen.replace(val.rstrip("。"), key) return gen + def count_substr(gen, pattern): return len(re.findall(pattern, gen)) + def extract_choice(gen, prompt, choice_list): # 答案是A | 选项是A | 应该选A选项 - res = re.search(r"(?:(?:选|选择|选定)|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|为|:|:|】))[^ABCD]{0,10}?(?:是|为|:|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|,|,|.|、|A|B|C|D|$)", gen) - + res = re.search( + r"(?:(?:选|选择|选定)[::]?\s*|(?:(?:答案|选项)(?![^ABCD]{0,10}?(?:不|非)[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?(?:是|选|为|:|:|】))[^ABCD]{0,10}?)(A|B|C|D)(?:选项)?(?:\)|。|\.|,|,|.|、|A|B|C|D|$|:|:|\)|))", + gen, + ) + # A选项正确 | A选项符合题意 if res is None: - res = re.search(r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对|符合))[^ABCD]{0,4}?(?:正确|对|符合)", gen) + res = re.search( + r"(A|B|C|D)(?:选?项)?(?![^ABCD]{0,4}?(?:不|非)[^ABCD]{0,4}?(?:正确|对[的,。:]|符合))[^ABCD]{0,4}?(?:正确|对[的,。:]|符合)", + gen, + ) # 直接输出 A if res is None: - res = re.search(r"^(A|B|C|D)(?:。|\.|,|,|.|$)", gen) + res = re.search(r"^[\((]?(A|B|C|D)(?:。|\)|)|\.|,|,|.|:|:|$)", gen) # 获取第一个出现的字母 if res is None: @@ -78,41 +88,46 @@ def extract_choice(gen, prompt, choice_list): if res is None: return choices[choice_list.index(process.extractOne(gen, choice_list)[0])] - else: - return res.group(1) + return res.group(1) + def format_example(line): - example = line['question'] + "\n\n" + example = line["question"] + "\n\n" for choice in choices: - example += f'{choice}. {line[f"{choice}"]}\n' + example += f'{choice}. {line[f"{choice}"]}\n' return example + def extract_answer(response, row): - prompt = row['question'] - gen = process_before_extraction(response, prompt, {choice: row[choice] for choice in choices}) + prompt = row["question"] + gen = process_before_extraction( + response, prompt, {choice: row[choice] for choice in choices} + ) if not isinstance(prompt, str): prompt = prompt[0] pred = extract_choice(gen, prompt, [row[choice] for choice in choices]) return pred + @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - save_result_dir=None, - overwrite=False, - **kwargs + model, + tokenizer, + subject_name, + test_df, + save_result_dir=None, + overwrite=False, + **kwargs ): - - result_path = os.path.join(save_result_dir, f'{subject_name}_result.csv') + result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv") if not overwrite and os.path.exists(result_path): print(f"{result_path} existed, skip!") score = [] - for (_, datarow), (_, resultrow) in zip(test_df.iterrows(), pd.read_csv(result_path).iterrows()): - pred = extract_answer(resultrow['model_response'], datarow) - correct = 1 if pred == datarow['answer'] else 0 + for (_, datarow), (_, resultrow) in zip( + test_df.iterrows(), pd.read_csv(result_path).iterrows() + ): + pred = extract_answer(resultrow["model_response"], datarow) + correct = 1 if pred == datarow["answer"] else 0 score.append(correct) correct_ratio = 100 * sum(score) / len(score) return correct_ratio @@ -124,7 +139,7 @@ def eval_subject( for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row) - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -134,22 +149,24 @@ def eval_subject( pred = extract_answer(response, row) print(pred) print("======================") - - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') responses.append(response) result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_response'] = responses - test_df['model_output'] = result + test_df["model_response"] = responses + test_df["model_output"] = result if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) @@ -162,89 +179,225 @@ def cal_ceval(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for tt in res.keys(): - name = tt.split('-')[-1] + name = tt.split("-")[-1] acc_sum += float(res[tt]) cnt += 1 class_ = TASK_NAME_MAPPING[name][2] if class_ not in acc_sum_dict: - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 if name in hard_list: hard_cnt += 1 hard_acc_sum += float(res[tt]) acc_sum_dict[class_] += float(res[tt]) cnt_dict[class_] += 1 - print('\n\n\n') - for k in ['STEM', 'Social Science', 'Humanities', 'Other']: + print("\n\n\n") + for k in ["STEM", "Social Science", "Humanities", "Other"]: if k in cnt_dict: - print('%s acc: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k])) + print("%s acc: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k])) if hard_cnt > 0: - print('Hard acc:%.2f ' % (hard_acc_sum / hard_cnt)) - print('AVERAGE acc:%.2f ' % (acc_sum / cnt)) + print("Hard acc:%.2f " % (hard_acc_sum / hard_cnt)) + print("AVERAGE acc:%.2f " % (acc_sum / cnt)) TASK_NAME_MAPPING = { "computer_network": ["Computer Network", "\u8ba1\u7b97\u673a\u7f51\u7edc", "STEM"], "operating_system": ["Operating System", "\u64cd\u4f5c\u7cfb\u7edf", "STEM"], - "computer_architecture": ["Computer Architecture", "\u8ba1\u7b97\u673a\u7ec4\u6210", "STEM"], + "computer_architecture": [ + "Computer Architecture", + "\u8ba1\u7b97\u673a\u7ec4\u6210", + "STEM", + ], "college_programming": ["College Programming", "\u5927\u5b66\u7f16\u7a0b", "STEM"], "college_physics": ["College Physics", "\u5927\u5b66\u7269\u7406", "STEM"], "college_chemistry": ["College Chemistry", "\u5927\u5b66\u5316\u5b66", "STEM"], - "advanced_mathematics": ["Advanced Mathematics", "\u9ad8\u7b49\u6570\u5b66", "STEM"], - "probability_and_statistics": ["Probability and Statistics", "\u6982\u7387\u7edf\u8ba1", "STEM"], - "discrete_mathematics": ["Discrete Mathematics", "\u79bb\u6563\u6570\u5b66", "STEM"], - "electrical_engineer": ["Electrical Engineer", "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", "STEM"], - "metrology_engineer": ["Metrology Engineer", "\u6ce8\u518c\u8ba1\u91cf\u5e08", "STEM"], - "high_school_mathematics": ["High School Mathematics", "\u9ad8\u4e2d\u6570\u5b66", "STEM"], + "advanced_mathematics": [ + "Advanced Mathematics", + "\u9ad8\u7b49\u6570\u5b66", + "STEM", + ], + "probability_and_statistics": [ + "Probability and Statistics", + "\u6982\u7387\u7edf\u8ba1", + "STEM", + ], + "discrete_mathematics": [ + "Discrete Mathematics", + "\u79bb\u6563\u6570\u5b66", + "STEM", + ], + "electrical_engineer": [ + "Electrical Engineer", + "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08", + "STEM", + ], + "metrology_engineer": [ + "Metrology Engineer", + "\u6ce8\u518c\u8ba1\u91cf\u5e08", + "STEM", + ], + "high_school_mathematics": [ + "High School Mathematics", + "\u9ad8\u4e2d\u6570\u5b66", + "STEM", + ], "high_school_physics": ["High School Physics", "\u9ad8\u4e2d\u7269\u7406", "STEM"], - "high_school_chemistry": ["High School Chemistry", "\u9ad8\u4e2d\u5316\u5b66", "STEM"], + "high_school_chemistry": [ + "High School Chemistry", + "\u9ad8\u4e2d\u5316\u5b66", + "STEM", + ], "high_school_biology": ["High School Biology", "\u9ad8\u4e2d\u751f\u7269", "STEM"], - "middle_school_mathematics": ["Middle School Mathematics", "\u521d\u4e2d\u6570\u5b66", "STEM"], - "middle_school_biology": ["Middle School Biology", "\u521d\u4e2d\u751f\u7269", "STEM"], - "middle_school_physics": ["Middle School Physics", "\u521d\u4e2d\u7269\u7406", "STEM"], - "middle_school_chemistry": ["Middle School Chemistry", "\u521d\u4e2d\u5316\u5b66", "STEM"], + "middle_school_mathematics": [ + "Middle School Mathematics", + "\u521d\u4e2d\u6570\u5b66", + "STEM", + ], + "middle_school_biology": [ + "Middle School Biology", + "\u521d\u4e2d\u751f\u7269", + "STEM", + ], + "middle_school_physics": [ + "Middle School Physics", + "\u521d\u4e2d\u7269\u7406", + "STEM", + ], + "middle_school_chemistry": [ + "Middle School Chemistry", + "\u521d\u4e2d\u5316\u5b66", + "STEM", + ], "veterinary_medicine": ["Veterinary Medicine", "\u517d\u533b\u5b66", "STEM"], - "college_economics": ["College Economics", "\u5927\u5b66\u7ecf\u6d4e\u5b66", "Social Science"], - "business_administration": ["Business Administration", "\u5de5\u5546\u7ba1\u7406", "Social Science"], - "marxism": ["Marxism", "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", "Social Science"], - "mao_zedong_thought": ["Mao Zedong Thought", "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", "Social Science"], + "college_economics": [ + "College Economics", + "\u5927\u5b66\u7ecf\u6d4e\u5b66", + "Social Science", + ], + "business_administration": [ + "Business Administration", + "\u5de5\u5546\u7ba1\u7406", + "Social Science", + ], + "marxism": [ + "Marxism", + "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406", + "Social Science", + ], + "mao_zedong_thought": [ + "Mao Zedong Thought", + "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba", + "Social Science", + ], "education_science": ["Education Science", "\u6559\u80b2\u5b66", "Social Science"], - "teacher_qualification": ["Teacher Qualification", "\u6559\u5e08\u8d44\u683c", "Social Science"], - "high_school_politics": ["High School Politics", "\u9ad8\u4e2d\u653f\u6cbb", "Social Science"], - "high_school_geography": ["High School Geography", "\u9ad8\u4e2d\u5730\u7406", "Social Science"], - "middle_school_politics": ["Middle School Politics", "\u521d\u4e2d\u653f\u6cbb", "Social Science"], - "middle_school_geography": ["Middle School Geography", "\u521d\u4e2d\u5730\u7406", "Social Science"], - "modern_chinese_history": ["Modern Chinese History", "\u8fd1\u4ee3\u53f2\u7eb2\u8981", "Humanities"], - "ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", "Humanities"], + "teacher_qualification": [ + "Teacher Qualification", + "\u6559\u5e08\u8d44\u683c", + "Social Science", + ], + "high_school_politics": [ + "High School Politics", + "\u9ad8\u4e2d\u653f\u6cbb", + "Social Science", + ], + "high_school_geography": [ + "High School Geography", + "\u9ad8\u4e2d\u5730\u7406", + "Social Science", + ], + "middle_school_politics": [ + "Middle School Politics", + "\u521d\u4e2d\u653f\u6cbb", + "Social Science", + ], + "middle_school_geography": [ + "Middle School Geography", + "\u521d\u4e2d\u5730\u7406", + "Social Science", + ], + "modern_chinese_history": [ + "Modern Chinese History", + "\u8fd1\u4ee3\u53f2\u7eb2\u8981", + "Humanities", + ], + "ideological_and_moral_cultivation": [ + "Ideological and Moral Cultivation", + "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840", + "Humanities", + ], "logic": ["Logic", "\u903b\u8f91\u5b66", "Humanities"], "law": ["Law", "\u6cd5\u5b66", "Humanities"], - "chinese_language_and_literature": ["Chinese Language and Literature", "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", "Humanities"], + "chinese_language_and_literature": [ + "Chinese Language and Literature", + "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66", + "Humanities", + ], "art_studies": ["Art Studies", "\u827a\u672f\u5b66", "Humanities"], - "professional_tour_guide": ["Professional Tour Guide", "\u5bfc\u6e38\u8d44\u683c", "Humanities"], - "legal_professional": ["Legal Professional", "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", "Humanities"], - "high_school_chinese": ["High School Chinese", "\u9ad8\u4e2d\u8bed\u6587", "Humanities"], - "high_school_history": ["High School History", "\u9ad8\u4e2d\u5386\u53f2", "Humanities"], - "middle_school_history": ["Middle School History", "\u521d\u4e2d\u5386\u53f2", "Humanities"], + "professional_tour_guide": [ + "Professional Tour Guide", + "\u5bfc\u6e38\u8d44\u683c", + "Humanities", + ], + "legal_professional": [ + "Legal Professional", + "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c", + "Humanities", + ], + "high_school_chinese": [ + "High School Chinese", + "\u9ad8\u4e2d\u8bed\u6587", + "Humanities", + ], + "high_school_history": [ + "High School History", + "\u9ad8\u4e2d\u5386\u53f2", + "Humanities", + ], + "middle_school_history": [ + "Middle School History", + "\u521d\u4e2d\u5386\u53f2", + "Humanities", + ], "civil_servant": ["Civil Servant", "\u516c\u52a1\u5458", "Other"], "sports_science": ["Sports Science", "\u4f53\u80b2\u5b66", "Other"], "plant_protection": ["Plant Protection", "\u690d\u7269\u4fdd\u62a4", "Other"], "basic_medicine": ["Basic Medicine", "\u57fa\u7840\u533b\u5b66", "Other"], "clinical_medicine": ["Clinical Medicine", "\u4e34\u5e8a\u533b\u5b66", "Other"], - "urban_and_rural_planner": ["Urban and Rural Planner", "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", "Other"], + "urban_and_rural_planner": [ + "Urban and Rural Planner", + "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08", + "Other", + ], "accountant": ["Accountant", "\u6ce8\u518c\u4f1a\u8ba1\u5e08", "Other"], - "fire_engineer": ["Fire Engineer", "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", "Other"], - "environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", "Other"], + "fire_engineer": [ + "Fire Engineer", + "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08", + "Other", + ], + "environmental_impact_assessment_engineer": [ + "Environmental Impact Assessment Engineer", + "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08", + "Other", + ], "tax_accountant": ["Tax Accountant", "\u7a0e\u52a1\u5e08", "Other"], - "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"] + "physician": ["Physician", "\u533b\u5e08\u8d44\u683c", "Other"], } -hard_list = ['advanced_mathematics', 'discrete_mathematics', 'probability_and_statistics', 'college_physics', 'college_chemistry', 'high_school_mathematics', 'high_school_physics', 'high_school_chemistry'] +hard_list = [ + "advanced_mathematics", + "discrete_mathematics", + "probability_and_statistics", + "college_physics", + "college_chemistry", + "high_school_mathematics", + "high_school_physics", + "high_school_chemistry", +] choices = ["A", "B", "C", "D"] @@ -257,34 +410,50 @@ def main(args): print("model loaded") dev_result = {} for subject_name in tqdm(TASK_NAME_MAPPING.keys()): - val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - # dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - # test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + val_file_path = os.path.join( + args.eval_data_path, "val", f"{subject_name}_val.csv" + ) val_df = pd.read_csv(val_file_path) - # dev_df = pd.read_csv(dev_file_path) - # test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, val_df, - save_result_dir=f"outs_chat/ceval_eval_result", overwrite=args.overwrite) + score = eval_subject( + model, + tokenizer, + subject_name, + val_df, + save_result_dir="outs_chat/ceval_eval_result", + overwrite=args.overwrite, + ) dev_result[subject_name] = score cal_ceval(dev_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') - group.add_argument("--overwrite", action='store_true', default=False, - help='Overwrite existed results') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) + group.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Overwrite existed results", + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_chat_gsm8k.py b/eval/evaluate_chat_gsm8k.py index 1358264..c4de01e 100644 --- a/eval/evaluate_chat_gsm8k.py +++ b/eval/evaluate_chat_gsm8k.py @@ -1,15 +1,10 @@ -import random -import tqdm -import os -import re -import sys -import torch -import numpy as np -import jsonlines -import argparse import json +import re from pathlib import Path -from datasets import load_from_disk,load_dataset +import argparse +import numpy as np +import tqdm +from datasets import load_from_disk, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -18,39 +13,41 @@ python eval/evaluate_chat_gsm8k.py [--use-fewshot] ''' INVALID_ANS = "[invalid]" -DEVICE = "cuda:0" +DEVICE = "cuda:0" def doc_to_text(doc, use_fewshot): if use_fewshot: - context = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n" \ - "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n" \ - "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\n" \ - "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n" \ - "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n" \ - "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n" \ - "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n" \ - "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n" \ - f"Question: {doc['question']}\nLet's think step by step" + context = ( + "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\n" + "Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\n" + "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\n" + "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n\n" + "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\n" + "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n\n" + "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\n" + "For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n\n" + f"Question: {doc['question']}\nLet's think step by step" + ) else: - context = doc['question'] + context = doc["question"] return context + def decode(tokens_list, tokenizer, raw_text_len): sents = [] - # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("Question:")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, question): - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -64,7 +61,9 @@ def generate_sample(model, tokenizer, question): def extract_answer_hf(completion): def _get_last_digit(s): - _PAT_LAST_DIGIT = re.compile(r"(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))") + _PAT_LAST_DIGIT = re.compile( + r"(?<=(\s|[\$%#{]))([+-])?(?=(\S))(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?=(\s|[.,}]|$))" + ) match = list(_PAT_LAST_DIGIT.finditer(s)) if match: last_digit = match[-1].group().replace(",", "").replace("+", "") @@ -74,51 +73,66 @@ def extract_answer_hf(completion): print(f"No digits found in {s!r}") return last_digit - job_gen = completion.strip('.').replace('\n', '\\n') + job_gen = completion.strip(".").replace("\n", "\\n") last_digit = _get_last_digit(job_gen) if last_digit is not None: return eval(last_digit) - else: - return INVALID_ANS + return INVALID_ANS + def extract_answer(completion): try: - last_number = re.findall(r'\d+', completion)[-1] + last_number = re.findall(r"\d+", completion)[-1] return eval(last_number) except: return INVALID_ANS -def is_correct( completion, answer): + +def is_correct(completion, answer): gold = extract_answer(answer) assert gold != INVALID_ANS, "No ground truth answer found in the document." return extract_answer(completion) == gold -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=Path, help="Checkpoint path", default="Qwen/Qwen-7B-Chat") - parser.add_argument("-f","--sample-input-file", type=str, default=None) - parser.add_argument("-o","--sample-output-file", type=str, default="gsm8k_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=Path, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-f", "--sample-input-file", type=str, default=None) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl" + ) parser.add_argument("--use-fewshot", action="store_true") args = parser.parse_args() if args.sample_input_file is not None: - dataset = load_from_disk(args.sample_input_file)# or: + dataset = load_from_disk(args.sample_input_file) # or: else: dataset = load_dataset("gsm8k", "main") - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True, bf16=True, use_flash_attn=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True, bf16=True, use_flash_attn=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding test = dataset["test"] - f_output = open(args.sample_output_file, 'w', encoding='utf-8') + f_output = open(args.sample_output_file, "w", encoding="utf-8") tot_length = test.num_rows acc_res = [] for doc in tqdm.tqdm(test): @@ -132,6 +146,6 @@ if __name__ == '__main__': f_output.write(json.dumps(doc, ensure_ascii=False) + "\n") f_output.flush() acc_res.append(acc) - + f_output.close() print("4-shot Acc: " if args.use_fewshot else "Zero-shot Acc", np.mean(acc_res)) diff --git a/eval/evaluate_chat_humaneval.py b/eval/evaluate_chat_humaneval.py index c80c195..66dcec8 100644 --- a/eval/evaluate_chat_humaneval.py +++ b/eval/evaluate_chat_humaneval.py @@ -1,14 +1,10 @@ -import random -import tqdm -import os -import sys -import torch -import jsonlines -import argparse -import jsonlines -from pathlib import Path + import re import textwrap +import argparse +from pathlib import Path +import tqdm +import jsonlines from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -24,25 +20,31 @@ evaluate_functional_correctness HumanEval_res.jsonl DEVICE = "cuda:0" def extract_code(text, entry_point): - # 正则表达式匹配代码块 - code_block_pattern = re.compile(rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL) + code_block_pattern = re.compile( + rf"```(?:[Pp]ython\n)?.*?def\s+{entry_point}.*?:\n(.*?)\n```", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is None: - code_block_pattern = re.compile(rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL) + code_block_pattern = re.compile( + rf"def\s+{entry_point}.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is None: - code_block_pattern = re.compile(rf"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL) + code_block_pattern = re.compile( + r"def.*?:\n(.*?)(?:\n(?!\n*(?: |\t))|$)", re.DOTALL + ) code_block = code_block_pattern.search(text) if code_block is not None: return code_block.group(1) - else: - # if no code block is found, assume the LM is simply filling the code - return textwrap.indent(text, ' ' * 4) + + # if no code block is found, assume the LM is simply filling the code + return textwrap.indent(text, " " * 4) + def generate_sample(model, tokenizer, question, entry_point): - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -52,31 +54,56 @@ def generate_sample(model, tokenizer, question, entry_point): answer = extract_code(response, entry_point) return answer, response -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=Path, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument("-f","--sample-input-file", type=str, default=None, help="data path to HumanEval.jsonl") - parser.add_argument("-o","--sample-output-file", type=str, default="HumanEval_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=Path, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument( + "-f", + "--sample-input-file", + type=str, + default=None, + help="data path to HumanEval.jsonl", + ) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" + ) args = parser.parse_args() - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) f = jsonlines.open(args.sample_input_file) with f_output as output: - for jobj in tqdm.tqdm(f, desc='task_idx'): - prompt = "Help me fill the following code.\n" + jobj['prompt'] - task_id = jobj['task_id'] - answer, response = generate_sample(model, tokenizer, prompt, jobj['entry_point']) - gen_jobjs = {'task_id': task_id, "completion": answer, 'response': response} + for jobj in tqdm.tqdm(f, desc="task_idx"): + prompt = "Help me fill the following code.\n" + jobj["prompt"] + task_id = jobj["task_id"] + answer, response = generate_sample( + model, tokenizer, prompt, jobj["entry_point"] + ) + gen_jobjs = {"task_id": task_id, "completion": answer, "response": response} output.write(gen_jobjs) f_output.close() diff --git a/eval/evaluate_chat_mmlu.py b/eval/evaluate_chat_mmlu.py index 1fbf94e..259dc3a 100644 --- a/eval/evaluate_chat_mmlu.py +++ b/eval/evaluate_chat_mmlu.py @@ -1,14 +1,13 @@ import os -import pandas as pd -import numpy as np import argparse -import datasets -import torch import re -from thefuzz import process -from typing import List +import torch +import pandas as pd from tqdm import tqdm +from thefuzz import process from transformers.trainer_utils import set_seed +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig ''' wget https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -22,18 +21,29 @@ python eval/evaluate_chat_mmlu.py -d data/mmlu/data/ ''' def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True, bf16=True, use_flash_attn=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model.generation_config.do_sample = False # use greedy decoding + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def format_example(line): - example = 'The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n' + line['question'] + "\n" + example = ( + "The following is a multiple-choice question. Please choose the most suitable one among A, B, C and D as the answer to this question.\n\n" + + line["question"] + + "\n" + ) for choice in choices: example += f'{choice}. {line[f"{choice}"]}\n' return example @@ -47,13 +57,20 @@ def process_before_extraction(gen, choice_dict): gen = pattern.sub(key, gen) return gen + def extract_choice(gen, choice_list): # answer is A | choice is A | choose A - res = re.search(r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", gen) + res = re.search( + r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b", + gen, + ) # A is correct | A is right if res is None: - res = re.search(r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", gen) + res = re.search( + r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b", + gen, + ) # straight answer: A if res is None: @@ -65,32 +82,37 @@ def extract_choice(gen, choice_list): if res is None: return choices[choice_list.index(process.extractOne(gen, choice_list)[0])] - else: - return res.group(1) + return res.group(1) + def extract_answer(response, row): - gen = process_before_extraction(response, {choice: row[choice] for choice in choices}) + gen = process_before_extraction( + response, {choice: row[choice] for choice in choices} + ) pred = extract_choice(gen, [row[choice] for choice in choices]) return pred + @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - save_result_dir=None, - overwrite=False, - **kwargs + model, + tokenizer, + subject_name, + test_df, + save_result_dir=None, + overwrite=False, + **kwargs ): - result_path = os.path.join(save_result_dir, f'{subject_name}_result.csv') + result_path = os.path.join(save_result_dir, f"{subject_name}_result.csv") if not overwrite and os.path.exists(result_path): print(f"{result_path} existed, skip!") score = [] - for (_, datarow), (_, resultrow) in zip(test_df.iterrows(), pd.read_csv(result_path).iterrows()): + for (_, datarow), (_, resultrow) in zip( + test_df.iterrows(), pd.read_csv(result_path).iterrows() + ): # pred = extract_answer(resultrow['model_response'], datarow) - pred = resultrow['model_output'] - correct = 1 if pred == datarow['answer'] else 0 + pred = resultrow["model_output"] + correct = 1 if pred == datarow["answer"] else 0 score.append(correct) return score @@ -100,7 +122,7 @@ def eval_subject( for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row) - response, history = model.chat( + response, _ = model.chat( tokenizer, question, history=None, @@ -111,20 +133,24 @@ def eval_subject( print(pred) print("======================") - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if save_result_dir: - test_df['model_output'] = result - test_df['model_response'] = response + test_df["model_output"] = result + test_df["model_response"] = response if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return score @@ -133,15 +159,13 @@ def cal_mmlu(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 - hard_cnt = 0 - hard_acc_sum = 0. for class_ in TASK_NAME_MAPPING.keys(): - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 for tt in TASK_NAME_MAPPING[class_]: acc_sum += sum(res[tt]) @@ -150,13 +174,12 @@ def cal_mmlu(res): acc_sum_dict[class_] += sum(res[tt]) cnt_dict[class_] += len(res[tt]) - print('\n\n\n') + print("\n\n\n") for k in TASK_NAME_MAPPING.keys(): if k in cnt_dict: - print('%s ACC: %.2f ' % ( - k, acc_sum_dict[k] * 100 / cnt_dict[k])) - print('AVERAGE ACC:%.2f ' % (acc_sum *100 / cnt)) - + print("%s ACC: %.2f " % (k, acc_sum_dict[k] * 100 / cnt_dict[k])) + print("AVERAGE ACC:%.2f " % (acc_sum * 100 / cnt)) + def main(args): print("loading model weights") @@ -170,38 +193,122 @@ def main(args): for subject_name in tqdm(SUBJECTS): # val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') # dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}_test.csv" + ) # val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer']) # dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer']) - test_df = pd.read_csv(test_file_path, names=['question','A','B','C','D','answer']) + test_df = pd.read_csv( + test_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) - score = eval_subject(model, tokenizer, subject_name, test_df, save_result_dir=f"outs_chat/mmlu_eval_result", overwrite=args.overwrite) + score = eval_subject( + model, + tokenizer, + subject_name, + test_df, + save_result_dir=f"outs_chat/mmlu_eval_result", + overwrite=args.overwrite, + ) dev_result[subject_name] = score cal_mmlu(dev_result) -TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'], - 'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'], - 'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'], - 'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']} +TASK_NAME_MAPPING = { + "stem": [ + "abstract_algebra", + "anatomy", + "astronomy", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_physics", + "computer_security", + "conceptual_physics", + "electrical_engineering", + "elementary_mathematics", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_mathematics", + "high_school_physics", + "high_school_statistics", + "machine_learning", + ], + "Humanities": [ + "formal_logic", + "high_school_european_history", + "high_school_us_history", + "high_school_world_history", + "international_law", + "jurisprudence", + "logical_fallacies", + "moral_disputes", + "moral_scenarios", + "philosophy", + "prehistory", + "professional_law", + "world_religions", + ], + "other": [ + "business_ethics", + "college_medicine", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + "professional_medicine", + "virology", + "global_facts", + "clinical_knowledge", + ], + "social": [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + ], +} SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl] choices = ["A", "B", "C", "D"] -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B-Chat") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - - """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, - help='Path to eval data') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') - group.add_argument("--overwrite", action='store_true', default=False, - help='Overwrite existed results') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data") + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) + group.add_argument( + "--overwrite", + action="store_true", + default=False, + help="Overwrite existed results", + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_cmmlu.py b/eval/evaluate_cmmlu.py index aafcc57..2d2371d 100644 --- a/eval/evaluate_cmmlu.py +++ b/eval/evaluate_cmmlu.py @@ -11,39 +11,46 @@ from tqdm import tqdm from transformers.trainer_utils import set_seed -''' +""" wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip mkdir data/cmmlu mv cmmlu_v1_0_1.zip data/cmmlu cd data/cmmlu; unzip cmmlu_v1_0_1.zip cd ../../ python evaluate_cmmlu.py -d data/cmmlu/ -''' +""" + def load_models_tokenizer(args): from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = '问题:' + line['Question'] + example = "问题:" + line["Question"] for choice in choices: example += f'\n{choice}. {line[f"{choice}"]}' if include_answer: - example += '\n答案:' + line["Answer"] + '\n\n' + example += "\n答案:" + line["Answer"] + "\n\n" else: - example += '\n答案:' + example += "\n答案:" return example def generate_few_shot_prompt(k, subject, dev_df): - prompt = '' + prompt = "" if k == -1: k = dev_df.shape[0] for i in range(k): @@ -55,35 +62,37 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) - tokens = {'input_ids': input_ids} + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else [] - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else [] + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) @@ -94,51 +103,56 @@ def eval_subject( logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer("A")['input_ids']], - logits[tokenizer("B")['input_ids']], - logits[tokenizer("C")['input_ids']], - logits[tokenizer("D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer("A")["input_ids"]], + logits[tokenizer("B")["input_ids"]], + logits[tokenizer("C")["input_ids"]], + logits[tokenizer("D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - if 'Answer' in row: - correct = 1 if pred == row['Answer'] else 0 + if "Answer" in row: + correct = 1 if pred == row["Answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["Answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["Answer"]}') result.append(pred) if score: correct_ratio = 100 * sum(score) / len(score) - if args.debug: print(subject_name, correct_ratio) + if args.debug: + print(subject_name, correct_ratio) else: correct_ratio = 0 if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return correct_ratio def cal_cmmlu(res): - print('\n\n\n') - res = {k.split('-')[-1]:float(v) for k,v in res.items()} + print("\n\n\n") + res = {k.split("-")[-1]: float(v) for k, v in res.items()} for k, v in TASK_NAME_MAPPING.items(): avg_acc = np.mean(list(map(lambda x: res[x], v))) print(f"{k} acc: {avg_acc:.2f}") @@ -147,85 +161,103 @@ def cal_cmmlu(res): subcategories = { - "agronomy": ['other'], - "anatomy": ['biology'], - "ancient_chinese": ['linguistics','china specific'], - "arts": ['arts'], - "astronomy": ['physics'], - "business_ethics": ['business'], - "chinese_civil_service_exam": ['politics','china specific'], - "chinese_driving_rule": ['other','china specific'], - "chinese_food_culture": ['culture','china specific'], - "chinese_foreign_policy": ['politics','china specific'], - "chinese_history":['history','china specific'], - "chinese_literature": ['literature','china specific'], - "chinese_teacher_qualification": ['education','china specific'], - "college_actuarial_science":['math'], - "college_education":['education'], - "college_engineering_hydrology": ['engineering'], - "college_law": ['law'], - "college_mathematics": ['math'], - "college_medical_statistics":['statistics'], - "clinical_knowledge": ['other'], - "college_medicine": ['other'], - "computer_science": ['computer science'], - "computer_security": ['other'], - "conceptual_physics": ['physics'], - "construction_project_management": ['other','china specific'], - "economics": ['economics'], - "education": ['education'], - "elementary_chinese":['linguistics','china specific'], - "elementary_commonsense":['other','china specific'], - "elementary_information_and_technology": ['other'], - "electrical_engineering": ['engineering'], - "elementary_mathematics": ['math'], - "ethnology": ['culture','china specific'], - "food_science": ['other'], - "genetics": ['biology'], - "global_facts": ['global'], - "high_school_biology": ['biology'], - "high_school_chemistry": ['chemistry'], - "high_school_geography": ['geography'], - "high_school_mathematics": ['math'], - "high_school_physics": ['physics'], - "high_school_politics": ['politics','china specific'], - "human_sexuality": ['other'], - "international_law": ['law'], - "journalism": ['sociology'], - "jurisprudence": ['law'], - "legal_and_moral_basis": ['other'], - "logical": ['philosophy'], - "machine_learning": ['computer science'], - "management": ['business'], - "marketing": ['business'], - "marxist_theory": ['philosophy'], - "modern_chinese": ['linguistics','china specific'], - "nutrition": ['other'], - "philosophy": ['philosophy'], - "professional_accounting": ['business'], - "professional_law": ['law'], - "professional_medicine": ['other'], - "professional_psychology": ['psychology'], - "public_relations": ['politics'], - "security_study": ['politics'], - "sociology": ['culture'], - "sports_science": ['other'], - "traditional_chinese_medicine": ['other','china specific'], - "virology": ['biology'], - "world_history":['history'], - "world_religions": ['global'], + "agronomy": ["other"], + "anatomy": ["biology"], + "ancient_chinese": ["linguistics", "china specific"], + "arts": ["arts"], + "astronomy": ["physics"], + "business_ethics": ["business"], + "chinese_civil_service_exam": ["politics", "china specific"], + "chinese_driving_rule": ["other", "china specific"], + "chinese_food_culture": ["culture", "china specific"], + "chinese_foreign_policy": ["politics", "china specific"], + "chinese_history": ["history", "china specific"], + "chinese_literature": ["literature", "china specific"], + "chinese_teacher_qualification": ["education", "china specific"], + "college_actuarial_science": ["math"], + "college_education": ["education"], + "college_engineering_hydrology": ["engineering"], + "college_law": ["law"], + "college_mathematics": ["math"], + "college_medical_statistics": ["statistics"], + "clinical_knowledge": ["other"], + "college_medicine": ["other"], + "computer_science": ["computer science"], + "computer_security": ["other"], + "conceptual_physics": ["physics"], + "construction_project_management": ["other", "china specific"], + "economics": ["economics"], + "education": ["education"], + "elementary_chinese": ["linguistics", "china specific"], + "elementary_commonsense": ["other", "china specific"], + "elementary_information_and_technology": ["other"], + "electrical_engineering": ["engineering"], + "elementary_mathematics": ["math"], + "ethnology": ["culture", "china specific"], + "food_science": ["other"], + "genetics": ["biology"], + "global_facts": ["global"], + "high_school_biology": ["biology"], + "high_school_chemistry": ["chemistry"], + "high_school_geography": ["geography"], + "high_school_mathematics": ["math"], + "high_school_physics": ["physics"], + "high_school_politics": ["politics", "china specific"], + "human_sexuality": ["other"], + "international_law": ["law"], + "journalism": ["sociology"], + "jurisprudence": ["law"], + "legal_and_moral_basis": ["other"], + "logical": ["philosophy"], + "machine_learning": ["computer science"], + "management": ["business"], + "marketing": ["business"], + "marxist_theory": ["philosophy"], + "modern_chinese": ["linguistics", "china specific"], + "nutrition": ["other"], + "philosophy": ["philosophy"], + "professional_accounting": ["business"], + "professional_law": ["law"], + "professional_medicine": ["other"], + "professional_psychology": ["psychology"], + "public_relations": ["politics"], + "security_study": ["politics"], + "sociology": ["culture"], + "sports_science": ["other"], + "traditional_chinese_medicine": ["other", "china specific"], + "virology": ["biology"], + "world_history": ["history"], + "world_religions": ["global"], } categories = { - "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"], + "STEM": [ + "physics", + "chemistry", + "biology", + "computer science", + "math", + "engineering", + "statistics", + ], "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"], - "Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"], - "Other":["other"], + "Social Science": [ + "linguistics", + "business", + "politics", + "culture", + "economics", + "geography", + "psychology", + "education", + "sociology", + ], + "Other": ["other"], "China specific": ["china specific"], } TASK_NAME_MAPPING = defaultdict(list) -for k,v in categories.items(): +for k, v in categories.items(): for subject, subcat in subcategories.items(): for c in subcat: if c in v: @@ -240,30 +272,52 @@ def main(args): test_result = {} for subject_name in tqdm(subcategories.keys()): - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}.csv') + dev_file_path = os.path.join(args.eval_data_path, "dev", f"{subject_name}.csv") + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}.csv" + ) dev_df = pd.read_csv(dev_file_path) test_df = pd.read_csv(test_file_path) - score = eval_subject(model, tokenizer, subject_name, dev_df=dev_df, test_df=test_df, k=5, few_shot=True, - save_result_dir=f"outs/cmmlu_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + dev_df=dev_df, + test_df=test_df, + k=5, + few_shot=True, + save_result_dir=f"outs/cmmlu_eval_result", + ) test_result[subject_name] = score cal_cmmlu(test_result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, required=True, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_data_path", type=str, required=True, help="Path to eval data" + ) + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) args = parser.parse_args() set_seed(args.seed) diff --git a/eval/evaluate_gsm8k.py b/eval/evaluate_gsm8k.py index 49d69c8..d3c5d37 100644 --- a/eval/evaluate_gsm8k.py +++ b/eval/evaluate_gsm8k.py @@ -1,15 +1,10 @@ -import random -import tqdm -import os import re -import sys import torch -import numpy as np -import jsonlines import argparse import jsonlines +import numpy as np import datasets -from datasets import load_from_disk,load_dataset +from datasets import load_from_disk, load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -17,31 +12,37 @@ from transformers.generation import GenerationConfig ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") INVALID_ANS = "[invalid]" + def doc_to_text(doc): - return fewshot_prompt + "\nQuestion: " + doc["question"] + "\nLet's think step by step\n" + return ( + fewshot_prompt + + "\nQuestion: " + + doc["question"] + + "\nLet's think step by step\n" + ) + def decode(tokens_list, tokenizer, raw_text_len): sents = [] # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("Question:")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, input_txt): input_ids = tokenizer.tokenizer.encode(input_txt) raw_text_len = len(input_ids) - context_enc = torch.tensor( - [input_ids]).to(model.device) + context_enc = torch.tensor([input_ids]).to(model.device) print(f"Input text: {input_txt}\n") outputs = model.generate(context_enc) - output_text = decode(outputs,tokenizer,raw_text_len)[0] + output_text = decode(outputs, tokenizer, raw_text_len)[0] print(f"\nOutput text: {output_text}\n") return output_text @@ -55,24 +56,34 @@ def extract_answer_hf(completion): else: return INVALID_ANS + def extract_answer(completion): try: - last_number = re.findall(r'\d+', completion)[-1] + last_number = re.findall(r"\d+", completion)[-1] return eval(last_number) except: return INVALID_ANS -def is_correct( completion, answer): + +def is_correct(completion, answer): gold = extract_answer_hf(answer) assert gold != INVALID_ANS, "No ground truth answer found in the document." return extract_answer(completion) == gold -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=str, help="Checkpoint path", default="Qwen/Qwen-7B") - parser.add_argument("-f","--sample-input-file", type=str, default=None) - parser.add_argument("-o","--sample-output-file", type=str, default="gsm8k_res.jsonl") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-f", "--sample-input-file", type=str, default=None) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="gsm8k_res.jsonl" + ) args = parser.parse_args() @@ -80,31 +91,37 @@ if __name__ == '__main__': if args.sample_input_file is not None: dataset = load_from_disk(args.sample_input_file) else: - config = datasets.DownloadConfig(resume_download=True, max_retries=100) - dataset = load_dataset("gsm8k", 'main', download_config=config) + config = datasets.DownloadConfig(resume_download=True, max_retries=100) + dataset = load_dataset("gsm8k", "main", download_config=config) test = dataset["test"] - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False - - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) tot_length = test.num_rows acc_res = [] for doc in test: context = doc_to_text(doc) completion = generate_sample(model, tokenizer, context) - answer= doc["answer"] + answer = doc["answer"] acc = is_correct(completion, answer) - doc["completion"]=completion - doc["acc"]=acc + doc["completion"] = completion + doc["acc"] = acc f_output.write(doc) acc_res.append(acc) - + f_output.close() - print("Acc: ",np.mean(acc_res)) \ No newline at end of file + print("Acc: ", np.mean(acc_res)) diff --git a/eval/evaluate_humaneval.py b/eval/evaluate_humaneval.py index af78319..78eb744 100644 --- a/eval/evaluate_humaneval.py +++ b/eval/evaluate_humaneval.py @@ -1,11 +1,7 @@ -import random +import argparse import tqdm -import os -import sys import torch import jsonlines -import argparse -import jsonlines from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig @@ -15,56 +11,75 @@ $ pip install -e human-eval evaluate_functional_correctness sample-output-file """ + def decode(tokens_list, tokenizer, raw_text_len): sents = [] # print(len(tokens_list)) for tokens in tokens_list: tokens = tokens.cpu().numpy().tolist() - sent = tokenizer.tokenizer.decode( - tokens[raw_text_len:]) - sent = sent.split('<|endoftext|>')[0] - sent = sent.split('\n\n\n')[0] + sent = tokenizer.tokenizer.decode(tokens[raw_text_len:]) + sent = sent.split("<|endoftext|>")[0] + sent = sent.split("\n\n\n")[0] sent = sent.split("\n\n")[0] sent = sent.split("def ")[0] sents.append(sent) return sents + def generate_sample(model, tokenizer, input_txt): input_ids = tokenizer.tokenizer.encode(input_txt) raw_text_len = len(input_ids) - context_enc = torch.tensor([input_ids] ).to(model.device) + context_enc = torch.tensor([input_ids]).to(model.device) print(f"Input text: {input_txt}\n") outputs = model.generate(context_enc) - output_text = decode(outputs,tokenizer,raw_text_len)[0] + output_text = decode(outputs, tokenizer, raw_text_len)[0] print(f"\nOutput text: \n{output_text}\n") return output_text -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument("-c", "--checkpoint-path", type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument("-f","--sample-input-file", type=str, default=None, help="data path to HumanEval.jsonl") - parser.add_argument("-o","--sample-output-file", type=str, default="HumanEval_res.jsonl") - +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument( + "-f", + "--sample-input-file", + type=str, + default=None, + help="data path to HumanEval.jsonl", + ) + parser.add_argument( + "-o", "--sample-output-file", type=str, default="HumanEval_res.jsonl" + ) args = parser.parse_args() - print('Loading tokenizer ...') - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading tokenizer ...") + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) - print('Loading model ...') - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + print("Loading model ...") + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False - - f_output = jsonlines.Writer(open(args.sample_output_file, 'w', encoding='utf-8')) + + f_output = jsonlines.Writer(open(args.sample_output_file, "w", encoding="utf-8")) f = jsonlines.open(args.sample_input_file) with f_output as output: - for jobj in tqdm.tqdm(f, desc='task_idx'): - prompt = jobj['prompt'] - task_id = jobj['task_id'] + for jobj in tqdm.tqdm(f, desc="task_idx"): + prompt = jobj["prompt"] + task_id = jobj["task_id"] gen_sents = generate_sample(model, tokenizer, prompt) - gen_jobjs = {'task_id': task_id, "completion": gen_sents} + gen_jobjs = {"task_id": task_id, "completion": gen_sents} output.write(gen_jobjs) - f_output.close() \ No newline at end of file + f_output.close() diff --git a/eval/evaluate_mmlu.py b/eval/evaluate_mmlu.py index 1b6970c..2843434 100644 --- a/eval/evaluate_mmlu.py +++ b/eval/evaluate_mmlu.py @@ -1,57 +1,60 @@ import os +from typing import List import pandas as pd import numpy as np import argparse -import datasets import torch - -from typing import List from tqdm import tqdm from transformers.trainer_utils import set_seed +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.generation import GenerationConfig - -''' +""" wget https://people.eecs.berkeley.edu/~hendrycks/data.tar mkdir data/mmlu mv data.tar data/mmlu cd data/mmlu; tar xf data.tar cd ../../ python eval/evaluate_mmlu.py -d data/mmlu/data/ -''' +""" def load_models_tokenizer(args): - from transformers import AutoModelForCausalLM, AutoTokenizer - from transformers.generation import GenerationConfig - - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval() - model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, device_map="auto", trust_remote_code=True + ).eval() + model.generation_config = GenerationConfig.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) return model, tokenizer def format_example(line, include_answer=True): - example = 'Question: ' + line['question'] + example = "Question: " + line["question"] for choice in choices: example += f'\n{choice}. {line[f"{choice}"]}' - + if include_answer: - example += '\nAnswer: ' + line["answer"] + '\n\n' + example += "\nAnswer: " + line["answer"] + "\n\n" else: - example += '\nAnswer:' + example += "\nAnswer:" return example def generate_few_shot_prompt(k, subject, dev_df): - def format_subject(subject): l = subject.split("_") s = "" for entry in l: s += " " + entry return s.strip() - - prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject)) + + prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format( + format_subject(subject) + ) if k == -1: k = dev_df.shape[0] @@ -64,81 +67,87 @@ def generate_few_shot_prompt(k, subject, dev_df): def get_logits(tokenizer, model, inputs: List[str]): - input_ids = tokenizer(inputs, padding=False)['input_ids'] + input_ids = tokenizer(inputs, padding=False)["input_ids"] input_ids = torch.tensor(input_ids, device=model.device) if input_ids.shape[1] > args.max_seq_len: - input_ids = input_ids[:, input_ids.shape[1]-args.max_seq_len+1:] - tokens = {'input_ids': input_ids} + input_ids = input_ids[:, input_ids.shape[1] - args.max_seq_len + 1 :] + tokens = {"input_ids": input_ids} - outputs = model(input_ids)['logits'] + outputs = model(input_ids)["logits"] logits = outputs[:, -1, :] log_probs = torch.nn.functional.softmax(logits, dim=-1) - return log_probs, {'tokens': tokens} + return log_probs, {"tokens": tokens} @torch.no_grad() def eval_subject( - model, - tokenizer, - subject_name, - test_df, - k=5, - dev_df=None, - few_shot=False, - save_result_dir=None, - **kwargs + model, + tokenizer, + subject_name, + test_df, + k=5, + dev_df=None, + few_shot=False, + save_result_dir=None, + **kwargs, ): result = [] score = [] - few_shot_prompt = generate_few_shot_prompt( - k, subject_name, dev_df) if few_shot else [] - all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []} - if args.debug: print(f"few_shot_prompt: {few_shot_prompt}") + few_shot_prompt = ( + generate_few_shot_prompt(k, subject_name, dev_df) if few_shot else [] + ) + all_probs = {"prob_A": [], "prob_B": [], "prob_C": [], "prob_D": []} + if args.debug: + print(f"few_shot_prompt: {few_shot_prompt}") for _, row in tqdm(test_df.iterrows(), total=len(test_df)): question = format_example(row, include_answer=False) full_prompt = few_shot_prompt + question - + output, input_info = get_logits(tokenizer, model, [full_prompt]) assert output.shape[0] == 1 logits = output.flatten() softval = torch.nn.functional.softmax( - torch.tensor( - [ - logits[tokenizer(" A")['input_ids']], - logits[tokenizer(" B")['input_ids']], - logits[tokenizer(" C")['input_ids']], - logits[tokenizer(" D")['input_ids']], - ] - ), - dim=0, - ) + torch.tensor( + [ + logits[tokenizer(" A")["input_ids"]], + logits[tokenizer(" B")["input_ids"]], + logits[tokenizer(" C")["input_ids"]], + logits[tokenizer(" D")["input_ids"]], + ] + ), + dim=0, + ) if softval.dtype in {torch.bfloat16, torch.float16}: softval = softval.to(dtype=torch.float32) probs = softval.detach().cpu().numpy() for i, choice in enumerate(choices): - all_probs[f'prob_{choice}'].append(probs[i]) + all_probs[f"prob_{choice}"].append(probs[i]) pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)] - if 'answer' in row: - correct = 1 if pred == row['answer'] else 0 + if "answer" in row: + correct = 1 if pred == row["answer"] else 0 score.append(correct) - if args.debug: print(f'{question} pred: {pred} ref: {row["answer"]}') + if args.debug: + print(f'{question} pred: {pred} ref: {row["answer"]}') result.append(pred) if save_result_dir: - test_df['model_output'] = result + test_df["model_output"] = result for i, choice in enumerate(choices): - test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}']) + test_df[f"prob_{choice}"] = all_probs[f"prob_{choice}"] if score: test_df["correctness"] = score os.makedirs(save_result_dir, exist_ok=True) - test_df.to_csv(os.path.join( - save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False) + test_df.to_csv( + os.path.join(save_result_dir, f"{subject_name}_result.csv"), + encoding="utf-8", + index=False, + ) return score @@ -147,15 +156,15 @@ def cal_mmlu(res): acc_sum_dict = dict() acc_norm_sum_dict = dict() cnt_dict = dict() - acc_sum = 0. + acc_sum = 0.0 cnt = 0 hard_cnt = 0 - hard_acc_sum = 0. + hard_acc_sum = 0.0 for class_ in TASK_NAME_MAPPING.keys(): - acc_sum_dict[class_] = 0. - acc_norm_sum_dict[class_] = 0. - cnt_dict[class_] = 0. + acc_sum_dict[class_] = 0.0 + acc_norm_sum_dict[class_] = 0.0 + cnt_dict[class_] = 0.0 for tt in TASK_NAME_MAPPING[class_]: acc_sum += sum(res[tt]) @@ -164,13 +173,12 @@ def cal_mmlu(res): acc_sum_dict[class_] += sum(res[tt]) cnt_dict[class_] += len(res[tt]) - print('\n\n\n', 'total cnt:', cnt, '\n') + print("\n\n\n", "total cnt:", cnt, "\n") for k in TASK_NAME_MAPPING.keys(): if k in cnt_dict: - print('%s ACC: %.2f ' % ( - k, acc_sum_dict[k] / cnt_dict[k] * 100)) - print('AVERAGE ACC:%.2f ' % (acc_sum / cnt * 100)) - + print("%s ACC: %.2f " % (k, acc_sum_dict[k] / cnt_dict[k] * 100)) + print("AVERAGE ACC:%.2f " % (acc_sum / cnt * 100)) + def main(args): model, tokenizer = load_models_tokenizer(args) @@ -178,41 +186,130 @@ def main(args): dev_result = {} for subject_name in tqdm(SUBJECTS): # val_file_path = os.path.join(args.eval_data_path, 'val', f'{subject_name}_val.csv') - dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}_dev.csv') - test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}_test.csv') + dev_file_path = os.path.join( + args.eval_data_path, "dev", f"{subject_name}_dev.csv" + ) + test_file_path = os.path.join( + args.eval_data_path, "test", f"{subject_name}_test.csv" + ) # val_df = pd.read_csv(val_file_path, names=['question','A','B','C','D','answer']) - dev_df = pd.read_csv(dev_file_path, names=['question','A','B','C','D','answer']) - test_df = pd.read_csv(test_file_path, names=['question','A','B','C','D','answer']) + dev_df = pd.read_csv( + dev_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) + test_df = pd.read_csv( + test_file_path, names=["question", "A", "B", "C", "D", "answer"] + ) - score = eval_subject(model, tokenizer, subject_name, test_df, dev_df=dev_df, k=5, few_shot=True, - save_result_dir=f"outs/mmlu_eval_result") + score = eval_subject( + model, + tokenizer, + subject_name, + test_df, + dev_df=dev_df, + k=5, + few_shot=True, + save_result_dir=f"outs/mmlu_eval_result", + ) dev_result[subject_name] = score cal_mmlu(dev_result) -TASK_NAME_MAPPING = {'stem': ['abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'], - 'Humanities': ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'], - 'other': ['business_ethics', 'college_medicine', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology', 'global_facts', 'clinical_knowledge'], - 'social': ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']} +TASK_NAME_MAPPING = { + "stem": [ + "abstract_algebra", + "anatomy", + "astronomy", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_physics", + "computer_security", + "conceptual_physics", + "electrical_engineering", + "elementary_mathematics", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_mathematics", + "high_school_physics", + "high_school_statistics", + "machine_learning", + ], + "Humanities": [ + "formal_logic", + "high_school_european_history", + "high_school_us_history", + "high_school_world_history", + "international_law", + "jurisprudence", + "logical_fallacies", + "moral_disputes", + "moral_scenarios", + "philosophy", + "prehistory", + "professional_law", + "world_religions", + ], + "other": [ + "business_ethics", + "college_medicine", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + "professional_medicine", + "virology", + "global_facts", + "clinical_knowledge", + ], + "social": [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + ], +} SUBJECTS = [v for vl in TASK_NAME_MAPPING.values() for v in vl] choices = ["A", "B", "C", "D"] -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B") - parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed') - parser.add_argument('--gpu', type=int, default=0, help='gpu id') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") + parser.add_argument("--gpu", type=int, default=0, help="gpu id") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('-d', '--eval_data_path', type=str, - help='Path to eval data') - group.add_argument("--max-seq-len", type=int, default=2048, - help='Size of the output generated text.') - group.add_argument("--debug", action='store_true', default=False, - help='Print infos.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument("-d", "--eval_data_path", type=str, help="Path to eval data") + group.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Size of the output generated text.", + ) + group.add_argument( + "--debug", action="store_true", default=False, help="Print infos." + ) args = parser.parse_args() set_seed(args.seed) - main(args) \ No newline at end of file + main(args) diff --git a/eval/evaluate_plugin.py b/eval/evaluate_plugin.py index 89974ad..f3b953b 100644 --- a/eval/evaluate_plugin.py +++ b/eval/evaluate_plugin.py @@ -12,47 +12,48 @@ from transformers.generation import GenerationConfig from transformers.tools.evaluate_agent import evaluate_agent from transformers.trainer_utils import set_seed -data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'data') +data_root_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") def is_callable(response, golden): - return response['action'].strip().lower() == golden['action'].strip( - ).lower() + return response["action"].strip().lower() == golden["action"].strip().lower() def process_res(response): # parse response - response += '\n' # fix not-find bug - thought = response[:response.find('Action:')].strip() - action = response[response.find('Action:') + - len('Action:'):response.find('Action Input:')].strip() - action_input = response[response.find('Action Input:') + - len('Action Input:'):response.find('Observation:' - )].strip() - #TODO: This parsing result is incorrect if the response contains multiple Actions. To be fixed in the future. - observation = response[response.find('Observation:') + - len('Observation:'):response.rfind('Thought:' - )].strip() - thought_last = response[response.rfind('Thought:') + - len('Thought:'):response.find('Final Answer:' - )].strip() - final_answer = response[response.find('Final Answer:') + - len('Final Answer:'):].strip() + response += "\n" # fix not-find bug + thought = response[: response.find("Action:")].strip() + action = response[ + response.find("Action:") + len("Action:") : response.find("Action Input:") + ].strip() + action_input = response[ + response.find("Action Input:") + + len("Action Input:") : response.find("Observation:") + ].strip() + # TODO: This parsing result is incorrect if the response contains multiple Actions. To be fixed in the future. + observation = response[ + response.find("Observation:") + len("Observation:") : response.rfind("Thought:") + ].strip() + thought_last = response[ + response.rfind("Thought:") + len("Thought:") : response.find("Final Answer:") + ].strip() + final_answer = response[ + response.find("Final Answer:") + len("Final Answer:") : + ].strip() try: - action_input = json.dumps(json5.loads(action_input), - ensure_ascii=False, - sort_keys=True) + action_input = json.dumps( + json5.loads(action_input), ensure_ascii=False, sort_keys=True + ) except: # print("JSON Load Error:", action_input) pass res_dict = { - 'thought': thought, - 'action': action, - 'action_input': action_input, - 'observation': observation, - 'thought_last': thought_last, - 'final_answer': final_answer + "thought": thought, + "action": action, + "action_input": action_input, + "observation": observation, + "thought_last": thought_last, + "final_answer": final_answer, } return res_dict @@ -68,20 +69,18 @@ def _get_tokenized_string(tokenizer, text_list): assert tokenizer is not None token_ids = tokenizer.encode(text) tokens_bytes = tokenizer.convert_ids_to_tokens(token_ids) - tokens = [ - token.decode('utf-8', errors='replace') for token in tokens_bytes - ] - tokenized_string = ' '.join(tokens) + tokens = [token.decode("utf-8", errors="replace") for token in tokens_bytes] + tokenized_string = " ".join(tokens) token_ids_list.append(token_ids) tokenized_string_list.append(tokenized_string) return token_ids_list, tokenized_string_list def eval_action(job): - response = job['gen'][0] - golden = job['response'] + response = job["gen"][0] + golden = job["response"] - if 'Action:' in response: + if "Action:" in response: response, golden = process_res(response), process_res(golden) if is_callable(response, golden): return True @@ -89,26 +88,29 @@ def eval_action(job): def eval_action_input(job, tokenizer): - response = job['gen'][0] - golden = job['response'] + response = job["gen"][0] + golden = job["response"] response, golden = process_res(response), process_res(golden) - query = job['prompt'] + query = job["prompt"] job = {} - job['prompt'] = query - job['gen'] = response['action_input'] - job['response'] = golden['action_input'] + job["prompt"] = query + job["gen"] = response["action_input"] + job["response"] = golden["action_input"] - job['_gen_tok'], job['_gen_tok_str'] = _get_tokenized_string( - tokenizer, [response['action_input']]) - job['_reference_tok'], job['_reference_tok_str'] = _get_tokenized_string( - tokenizer, [golden['action_input']]) + job["_gen_tok"], job["_gen_tok_str"] = _get_tokenized_string( + tokenizer, [response["action_input"]] + ) + job["_reference_tok"], job["_reference_tok_str"] = _get_tokenized_string( + tokenizer, [golden["action_input"]] + ) - scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], - tokenizer=_DummyTokenizer()) - score = scorer.score(job['_reference_tok_str'][0], job['_gen_tok_str'][0]) + scorer = rouge_scorer.RougeScorer( + ["rouge1", "rouge2", "rougeL"], tokenizer=_DummyTokenizer() + ) + score = scorer.score(job["_reference_tok_str"][0], job["_gen_tok_str"][0]) - rouge = score['rougeL'].fmeasure + rouge = score["rougeL"].fmeasure return rouge @@ -124,24 +126,33 @@ class QWenAgent(Agent): agent.run("Draw me a picture of rivers and lakes.") ``` """ - def __init__(self, - chat_prompt_template=None, - run_prompt_template=None, - additional_tools=None, - tokenizer=None, - model=None): + + def __init__( + self, + chat_prompt_template=None, + run_prompt_template=None, + additional_tools=None, + tokenizer=None, + model=None, + ): if tokenizer and model: self.tokenizer = tokenizer self.model = model else: - checkpoint = 'Qwen/Qwen-7B-Chat' + checkpoint = "Qwen/Qwen-7B-Chat" self.tokenizer = AutoTokenizer.from_pretrained( - checkpoint, trust_remote_code=True) - self.model = AutoModelForCausalLM.from_pretrained( - checkpoint, device_map='auto', - trust_remote_code=True).cuda().eval() + checkpoint, trust_remote_code=True + ) + self.model = ( + AutoModelForCausalLM.from_pretrained( + checkpoint, device_map="auto", trust_remote_code=True + ) + .cuda() + .eval() + ) self.model.generation_config = GenerationConfig.from_pretrained( - checkpoint, trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参 + checkpoint, trust_remote_code=True + ) # 可指定不同的生成长度、top_p等相关超参 self.model.generation_config.do_sample = False # greedy super().__init__( @@ -152,155 +163,161 @@ class QWenAgent(Agent): def generate_one(self, prompt, stop): # "Human:" 和 "Assistant:" 曾为通义千问的特殊保留字,需要替换为 "_HUMAN_:" 和 "_ASSISTANT_:"。这一问题将在未来版本修复。 - prompt = prompt.replace('Human:', - '_HUMAN_:').replace('Assistant:', - '_ASSISTANT_:') + prompt = prompt.replace("Human:", "_HUMAN_:").replace( + "Assistant:", "_ASSISTANT_:" + ) stop = [ - item.replace('Human:', '_HUMAN_:').replace('Assistant:', - '_ASSISTANT_:') + item.replace("Human:", "_HUMAN_:").replace("Assistant:", "_ASSISTANT_:") for item in stop ] result, _ = self.model.chat(self.tokenizer, prompt, history=None) for stop_seq in stop: if result.endswith(stop_seq): - result = result[:-len(stop_seq)] + result = result[: -len(stop_seq)] - result = result.replace('_HUMAN_:', - 'Human:').replace('_ASSISTANT_:', 'Assistant:') + result = result.replace("_HUMAN_:", "Human:").replace( + "_ASSISTANT_:", "Assistant:" + ) return result def load_models_tokenizer(args): - tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, - trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, - device_map='auto', - trust_remote_code=True, - bf16=True, - use_flash_attn=True).eval() + tokenizer = AutoTokenizer.from_pretrained( + args.checkpoint_path, trust_remote_code=True + ) + model = AutoModelForCausalLM.from_pretrained( + args.checkpoint_path, + device_map="auto", + trust_remote_code=True, + bf16=True, + use_flash_attn=True, + ).eval() model.generation_config = GenerationConfig.from_pretrained( - args.checkpoint_path, trust_remote_code=True) + args.checkpoint_path, trust_remote_code=True + ) model.generation_config.do_sample = False # use greedy decoding return model, tokenizer def load_jobs(filename): jobs = [] - with jsonlines.open(os.path.join(data_root_path, filename), - mode='r') as reader: + with jsonlines.open(os.path.join(data_root_path, filename), mode="r") as reader: for job in reader: jobs.append(job) return jobs def react_inference(filename, model, tokenizer): - filename_cache = filename + '.cache' + filename_cache = filename + ".cache" if os.path.exists(os.path.join(data_root_path, filename_cache)): jobs = load_jobs(filename=filename_cache) - print('Loaded from', filename_cache) + print("Loaded from", filename_cache) else: - with open(os.path.join(data_root_path, filename_cache), 'w') as f: + with open(os.path.join(data_root_path, filename_cache), "w") as f: jobs = load_jobs(filename=filename) - print('Inference:', filename) + print("Inference:", filename) for job in tqdm(jobs): - response, history = model.chat(tokenizer, - job['prompt'], - history=None) - job['gen'] = [response] - f.writelines(json.dumps(job, ensure_ascii=False) + '\n') - print(filename_cache, 'is saved.') + response, history = model.chat(tokenizer, job["prompt"], history=None) + job["gen"] = [response] + f.writelines(json.dumps(job, ensure_ascii=False) + "\n") + print(filename_cache, "is saved.") return jobs def main(args): - print('loading model weights') + print("loading model weights") if args.checkpoint_path is not None: model, tokenizer = load_models_tokenizer(args) else: model, tokenizer = None, None - print('model loaded') + print("model loaded") result = {} # eval react positive if args.eval_react_positive: - print('eval react positive ...') + print("eval react positive ...") acc_count = 0 rouge_mean = 0 - jobs = react_inference(filename=args.eval_react_positive_filename, - model=model, - tokenizer=tokenizer) + jobs = react_inference( + filename=args.eval_react_positive_filename, model=model, tokenizer=tokenizer + ) for job in jobs: if eval_action(job): acc_count += 1 rouge = eval_action_input(job, tokenizer) - rouge_mean += (rouge / len(jobs)) + rouge_mean += rouge / len(jobs) scores = { - 'action_right_rate': acc_count / len(jobs), - 'action_input_rouge': rouge_mean, + "action_right_rate": acc_count / len(jobs), + "action_input_rouge": rouge_mean, } - result.update({'react_positive': scores}) + result.update({"react_positive": scores}) # eval react negative if args.eval_react_negative: - print('eval react negative ...') + print("eval react negative ...") bad_count = 0 - jobs = react_inference(filename=args.eval_react_negative_filename, - model=model, - tokenizer=tokenizer) + jobs = react_inference( + filename=args.eval_react_negative_filename, model=model, tokenizer=tokenizer + ) for job in jobs: - if '\nAction:' in job['gen'][0]: + if "\nAction:" in job["gen"][0]: bad_count += 1 - scores = {'bad_rate': bad_count / len(jobs)} - result.update({'react_negative': scores}) + scores = {"bad_rate": bad_count / len(jobs)} + result.update({"react_negative": scores}) # eval hfagent if args.eval_hfagent: - print('eval hfagent ...') + print("eval hfagent ...") agent = QWenAgent(model=model, tokenizer=tokenizer) scores = evaluate_agent(agent, verbose=False, return_errors=False) - result.update({'hfagent': scores}) + result.update({"hfagent": scores}) pp = pprint.PrettyPrinter(indent=4) pp.pprint(result) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Test HF checkpoint.') - parser.add_argument('-c', - '--checkpoint-path', - type=str, - help='Checkpoint path', - default='Qwen/Qwen-7B-Chat') - parser.add_argument('-s', - '--seed', - type=int, - default=1234, - help='Random seed') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test HF checkpoint.") + parser.add_argument( + "-c", + "--checkpoint-path", + type=str, + help="Checkpoint path", + default="Qwen/Qwen-7B-Chat", + ) + parser.add_argument("-s", "--seed", type=int, default=1234, help="Random seed") """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='Evaluation options') - group.add_argument('--eval-react-positive', - action='store_true', - default=False, - help='Eval react positive.') - group.add_argument('--eval-react-positive-filename', - type=str, - default='exam_plugin_v1_react_positive.jsonl', - help='Eval react positive filename.') - group.add_argument('--eval-react-negative', - action='store_true', - default=False, - help='Eval react negative.') - group.add_argument('--eval-react-negative-filename', - type=str, - default='exam_plugin_v1_react_negative.jsonl', - help='Eval react negative filename.') - group.add_argument('--eval-hfagent', - action='store_true', - default=False, - help='Eval hfagent.') + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "--eval-react-positive", + action="store_true", + default=False, + help="Eval react positive.", + ) + group.add_argument( + "--eval-react-positive-filename", + type=str, + default="exam_plugin_v1_react_positive.jsonl", + help="Eval react positive filename.", + ) + group.add_argument( + "--eval-react-negative", + action="store_true", + default=False, + help="Eval react negative.", + ) + group.add_argument( + "--eval-react-negative-filename", + type=str, + default="exam_plugin_v1_react_negative.jsonl", + help="Eval react negative filename.", + ) + group.add_argument( + "--eval-hfagent", action="store_true", default=False, help="Eval hfagent." + ) args = parser.parse_args() set_seed(args.seed)