diff --git a/README.md b/README.md
index ef33599..17deeb1 100644
--- a/README.md
+++ b/README.md
@@ -34,20 +34,20 @@ The following sections include information that you might find it helpful. Speci
 ## Performance
 
-In general, Qwen-7B outperforms the baseline models of a similar model size, and even outperforms larger models of around 13B parameters, on a series of benchmark datasets, e.g., MMLU, C-Eval, GSM8K, HumanEval, and WMT22, etc., which evaluate the models' capabilities on natural language understanding, mathematic problem solving, coding, etc. See the results below.
-
-| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) |
-| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: |
-| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 |
-| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 |
-| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 |
-| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - |
-| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 |
-| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 |
-| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 |
-| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 |
-| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - |
-| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** |
+In general, Qwen-7B outperforms the baseline models of a similar model size, and even outperforms larger models of around 13B parameters, on a series of benchmark datasets, e.g., MMLU, C-Eval, GSM8K, HumanEval, WMT22, and CMMLU, which evaluate the models' capabilities in natural language understanding, mathematical problem solving, coding, etc. See the results below.
+
+| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU |
+| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: | :------------: |
+| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - |
+| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - |
+| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 |
+| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 |
+| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - |
+| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 |
+| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - |
+| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - |
+| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - |
+| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** |
diff --git a/README_CN.md b/README_CN.md
index dfee8c5..e7a5e4e 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -33,20 +33,20 @@
 ## 评测表现
 
-Qwen-7B在多个全面评估自然语言理解与生成、数学运算解题、代码生成等能力的评测数据集上,包括MMLU、C-Eval、GSM8K、HumanEval、WMT22等,均超出了同规模大语言模型的表现,甚至超出了如12-13B参数等更大规模的语言模型。
-
-| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) |
-| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: |
-| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 |
-| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 |
-| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 |
-| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - |
-| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 |
-| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 |
-| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 |
-| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 |
-| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - |
-| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** |
+Qwen-7B在多个全面评估自然语言理解与生成、数学运算解题、代码生成等能力的评测数据集上,包括MMLU、C-Eval、GSM8K、HumanEval、WMT22、CMMLU等,均超出了同规模大语言模型的表现,甚至超出了如12-13B参数等更大规模的语言模型。
+
+| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU |
+| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: | :------------: |
+| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - |
+| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - |
+| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 |
+| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 |
+| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - |
+| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 |
+| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - |
+| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - |
+| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - |
+| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** |
diff --git a/README_JA.md b/README_JA.md
index b755d87..aaa290d 100644
--- a/README_JA.md
+++ b/README_JA.md
@@ -37,20 +37,20 @@ Qwen-7Bは、アリババクラウドが提唱する大規模言語モデルシ
 ## パフォーマンス
 
-一般的に、Qwen-7B は、MMLU、C-Eval、GSM8K、HumanEval、WMT22 などの自然言語理解、数学的問題解決、コーディングなどに関するモデルの能力を評価する一連のベンチマークデータセットにおいて、同程度のモデルサイズのベースラインモデルを凌駕し、さらには 13B 程度のパラメータを持つより大規模なモデルをも凌駕している。以下の結果をご覧ください。
-
-| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) |
-| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: |
-| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 |
-| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 |
-| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 |
-| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - |
-| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 |
-| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 |
-| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 |
-| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 |
-| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - |
-| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** |
+一般的に、Qwen-7B は、MMLU、C-Eval、GSM8K、HumanEval、WMT22、CMMLU などの自然言語理解、数学的問題解決、コーディングなどに関するモデルの能力を評価する一連のベンチマークデータセットにおいて、同程度のモデルサイズのベースラインモデルを凌駕し、さらには 13B 程度のパラメータを持つより大規模なモデルをも凌駕している。以下の結果をご覧ください。
+
+| Model | MMLU | C-Eval | GSM8K | HumanEval | WMT22 (en-zh) | CMMLU |
+| :---------------- | :------------: | :------------: | :------------: | :------------: | :------------: | :------------: |
+| LLaMA-7B | 35.1 | - | 11.0 | 10.5 | 8.7 | - |
+| LLaMA 2-7B | 45.3 | - | 14.6 | 12.8 | 17.9 | - |
+| Baichuan-7B | 42.3 | 42.8 | 9.7 | 9.2 | 26.6 | 44.4 |
+| ChatGLM2-6B | 47.9 | 51.7 | 32.4 | 9.2 | - | 48.8 |
+| InternLM-7B | 51.0 | 52.8 | 31.2 | 10.4 | 14.8 | - |
+| Baichuan-13B | 51.6 | 53.6 | 26.6 | 12.8 | 30.0 | 55.8 |
+| LLaMA-13B | 46.9 | 35.5 | 17.8 | 15.8 | 12.0 | - |
+| LLaMA 2-13B | 54.8 | - | 28.7 | 18.3 | 24.2 | - |
+| ChatGLM2-12B | 56.2 | **61.6** | 40.9 | - | - | - |
+| **Qwen-7B** | **56.7** | 59.6 | **51.6** | **24.4** | **30.6** | **58.8** |
diff --git a/eval/evaluate_cmmlu.py b/eval/evaluate_cmmlu.py
new file mode 100644
index 0000000..aafcc57
--- /dev/null
+++ b/eval/evaluate_cmmlu.py
@@ -0,0 +1,271 @@
+import os
+import pandas as pd
+import numpy as np
+import argparse
+import datasets
+import torch
+from collections import defaultdict
+
+from typing import List
+from tqdm import tqdm
+from transformers.trainer_utils import set_seed
+
+
+'''
+wget https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip
+mkdir data/cmmlu
+mv cmmlu_v1_0_1.zip data/cmmlu
+cd data/cmmlu; unzip cmmlu_v1_0_1.zip
+cd ../../
+python evaluate_cmmlu.py -d data/cmmlu/
+'''
+
+def load_models_tokenizer(args):
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from transformers.generation import GenerationConfig
+
+    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(args.checkpoint_path, device_map="auto", trust_remote_code=True).eval()
+    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint_path, trust_remote_code=True)
+    return model, tokenizer
+
+
+def format_example(line, include_answer=True):
+    example = '问题:' + line['Question']
+    for choice in choices:
+        example += f'\n{choice}. {line[f"{choice}"]}'
+
+    if include_answer:
+        example += '\n答案:' + line["Answer"] + '\n\n'
+    else:
+        example += '\n答案:'
+    return example
+
+
+def generate_few_shot_prompt(k, subject, dev_df):
+    prompt = ''
+    if k == -1:
+        k = dev_df.shape[0]
+    for i in range(k):
+        prompt += format_example(
+            dev_df.iloc[i, :],
+            include_answer=True,
+        )
+    return prompt
+
+
+def get_logits(tokenizer, model, inputs: List[str]):
+    input_ids = tokenizer(inputs, padding=False)['input_ids']
+    input_ids = torch.tensor(input_ids, device=model.device)
+    tokens = {'input_ids': input_ids}
+
+    outputs = model(input_ids)['logits']
+    logits = outputs[:, -1, :]
+    log_probs = torch.nn.functional.softmax(logits, dim=-1)
+    return log_probs, {'tokens': tokens}
+
+
+@torch.no_grad()
+def eval_subject(
+    model,
+    tokenizer,
+    subject_name,
+    test_df,
+    k=5,
+    dev_df=None,
+    few_shot=False,
+    save_result_dir=None,
+    **kwargs
+):
+    result = []
+    score = []
+
+    few_shot_prompt = generate_few_shot_prompt(
+        k, subject_name, dev_df) if few_shot else ''
+    all_probs = {'prob_A': [], 'prob_B': [], 'prob_C': [], 'prob_D': []}
+    if args.debug: print(f"few_shot_prompt: {few_shot_prompt}")
+
+    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
+        question = format_example(row, include_answer=False)
+        full_prompt = few_shot_prompt + question
+
+        output, input_info = get_logits(tokenizer, model, [full_prompt])
+        assert output.shape[0] == 1
+        logits = output.flatten()
+
+        softval = torch.nn.functional.softmax(
+            torch.tensor(
+                [
+                    logits[tokenizer("A")['input_ids']],
+                    logits[tokenizer("B")['input_ids']],
+                    logits[tokenizer("C")['input_ids']],
+                    logits[tokenizer("D")['input_ids']],
+                ]
+            ),
+            dim=0,
+        )
+        if softval.dtype in {torch.bfloat16, torch.float16}:
+            softval = softval.to(dtype=torch.float32)
+        probs = softval.detach().cpu().numpy()
+
+        for i, choice in enumerate(choices):
+            all_probs[f'prob_{choice}'].append(probs[i])
+        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
+
+        if 'Answer' in row:
+            correct = 1 if pred == row['Answer'] else 0
+            score.append(correct)
+            if args.debug: print(f'{question} pred: {pred} ref: {row["Answer"]}')
+        result.append(pred)
+
+    if score:
+        correct_ratio = 100 * sum(score) / len(score)
+        if args.debug: print(subject_name, correct_ratio)
+    else:
+        correct_ratio = 0
+    if save_result_dir:
+        test_df['model_output'] = result
+        for i, choice in enumerate(choices):
+            test_df[f'prob_{choice}'] = (all_probs[f'prob_{choice}'])
+        if score:
+            test_df["correctness"] = score
+        os.makedirs(save_result_dir, exist_ok=True)
+        test_df.to_csv(os.path.join(
+            save_result_dir, f'{subject_name}_result.csv'), encoding="utf-8", index=False)
+
+    return correct_ratio
+
+
+def cal_cmmlu(res):
+    print('\n\n\n')
+    res = {k.split('-')[-1]:float(v) for k,v in res.items()}
+    for k, v in TASK_NAME_MAPPING.items():
+        avg_acc = np.mean(list(map(lambda x: res[x], v)))
+        print(f"{k} acc: {avg_acc:.2f}")
+    avg_all_acc = np.mean(list(res.values()))
+    print(f"AVERAGE acc: {avg_all_acc:.2f}")
+
+
+subcategories = {
+    "agronomy": ['other'],
+    "anatomy": ['biology'],
+    "ancient_chinese": ['linguistics','china specific'],
+    "arts": ['arts'],
+    "astronomy": ['physics'],
+    "business_ethics": ['business'],
+    "chinese_civil_service_exam": ['politics','china specific'],
+    "chinese_driving_rule": ['other','china specific'],
+    "chinese_food_culture": ['culture','china specific'],
+    "chinese_foreign_policy": ['politics','china specific'],
+    "chinese_history":['history','china specific'],
+    "chinese_literature": ['literature','china specific'],
+    "chinese_teacher_qualification": ['education','china specific'],
+    "college_actuarial_science":['math'],
+    "college_education":['education'],
+    "college_engineering_hydrology": ['engineering'],
+    "college_law": ['law'],
+    "college_mathematics": ['math'],
+    "college_medical_statistics":['statistics'],
+    "clinical_knowledge": ['other'],
+    "college_medicine": ['other'],
+    "computer_science": ['computer science'],
+    "computer_security": ['other'],
+    "conceptual_physics": ['physics'],
+    "construction_project_management": ['other','china specific'],
+    "economics": ['economics'],
+    "education": ['education'],
+    "elementary_chinese":['linguistics','china specific'],
+    "elementary_commonsense":['other','china specific'],
+    "elementary_information_and_technology": ['other'],
+    "electrical_engineering": ['engineering'],
+    "elementary_mathematics": ['math'],
+    "ethnology": ['culture','china specific'],
+    "food_science": ['other'],
+    "genetics": ['biology'],
+    "global_facts": ['global'],
+    "high_school_biology": ['biology'],
+    "high_school_chemistry": ['chemistry'],
+    "high_school_geography": ['geography'],
+    "high_school_mathematics": ['math'],
+    "high_school_physics": ['physics'],
+    "high_school_politics": ['politics','china specific'],
+    "human_sexuality": ['other'],
+    "international_law": ['law'],
+    "journalism": ['sociology'],
+    "jurisprudence": ['law'],
+    "legal_and_moral_basis": ['other'],
+    "logical": ['philosophy'],
+    "machine_learning": ['computer science'],
+    "management": ['business'],
+    "marketing": ['business'],
+    "marxist_theory": ['philosophy'],
+    "modern_chinese": ['linguistics','china specific'],
+    "nutrition": ['other'],
+    "philosophy": ['philosophy'],
+    "professional_accounting": ['business'],
+    "professional_law": ['law'],
+    "professional_medicine": ['other'],
+    "professional_psychology": ['psychology'],
+    "public_relations": ['politics'],
+    "security_study": ['politics'],
+    "sociology": ['culture'],
+    "sports_science": ['other'],
+    "traditional_chinese_medicine": ['other','china specific'],
+    "virology": ['biology'],
+    "world_history":['history'],
+    "world_religions": ['global'],
+}
+
+categories = {
+    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
+    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
+    "Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
+    "Other":["other"],
+    "China specific": ["china specific"],
+}
+
+TASK_NAME_MAPPING = defaultdict(list)
+for k,v in categories.items():
+    for subject, subcat in subcategories.items():
+        for c in subcat:
+            if c in v:
+                TASK_NAME_MAPPING[k].append(subject)
+
+
+choices = ["A", "B", "C", "D"]
+
+
+def main(args):
+    model, tokenizer = load_models_tokenizer(args)
+
+    test_result = {}
+    for subject_name in tqdm(subcategories.keys()):
+        dev_file_path = os.path.join(args.eval_data_path, 'dev', f'{subject_name}.csv')
+        test_file_path = os.path.join(args.eval_data_path, 'test', f'{subject_name}.csv')
+        dev_df = pd.read_csv(dev_file_path)
+        test_df = pd.read_csv(test_file_path)
+
+        score = eval_subject(model, tokenizer, subject_name, dev_df=dev_df, test_df=test_df, k=5, few_shot=True,
+                             save_result_dir=f"outs/cmmlu_eval_result")
+        test_result[subject_name] = score
+    cal_cmmlu(test_result)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Test HF checkpoint.')
+    parser.add_argument('-c', '--checkpoint-path', type=str, help='Checkpoint path', default="Qwen/Qwen-7B")
+    parser.add_argument('-s', '--seed', type=int, default=1234, help='Random seed')
+
+    """Provide extra arguments required for tasks."""
+    group = parser.add_argument_group(title='Evaluation options')
+    group.add_argument('-d', '--eval_data_path', type=str, required=True,
+                       help='Path to eval data')
+    group.add_argument("--max-seq-len", type=int, default=2048,
+                       help='Size of the output generated text.')
+    group.add_argument("--debug", action='store_true', default=False,
+                       help='Print infos.')
+
+    args = parser.parse_args()
+    set_seed(args.seed)
+
+    main(args)