From 680a3e8bb8fe548397cf640a0bb60e1e01f3e852 Mon Sep 17 00:00:00 2001 From: "feihu.hf" Date: Fri, 4 Aug 2023 10:54:19 +0800 Subject: [PATCH] update EVALUATION.md --- eval/EVALUATION.md | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/eval/EVALUATION.md b/eval/EVALUATION.md index 09b009b..86dd3ed 100644 --- a/eval/EVALUATION.md +++ b/eval/EVALUATION.md @@ -8,7 +8,13 @@ mkdir data/ceval mv ceval-exam.zip data/ceval cd data/ceval; unzip ceval-exam.zip cd ../../ + +# Qwen-7B python evaluate_ceval.py -d data/ceval/ + +# Qwen-7B-Chat +pip install thefuzz +python evaluate_chat_ceval.py -d data/ceval/ ``` - MMLU @@ -19,7 +25,13 @@ mkdir data/mmlu mv data.tar data/mmlu cd data/mmlu; tar xf data.tar cd ../../ + +# Qwen-7B python evaluate_mmlu.py -d data/mmlu/data/ + +# Qwen-7B-Chat +pip install thefuzz +python evaluate_chat_mmlu.py -d data/mmlu/data/ ``` - HumanEval @@ -27,19 +39,28 @@ python evaluate_mmlu.py -d data/mmlu/data/ Get the HumanEval.jsonl file from [here](https://github.com/openai/human-eval/tree/master/data) ```Shell -python evaluate_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl git clone https://github.com/openai/human-eval pip install -e human-eval + +# Qwen-7B +python evaluate_humaneval.py -f HumanEval.jsonl -o HumanEval_res.jsonl evaluate_functional_correctness HumanEval_res.jsonl +# Qwen-7B-Chat +python evaluate_chat_mmlu.py -f HumanEval.jsonl -o HumanEval_res_chat.jsonl +evaluate_functional_correctness HumanEval_res_chat.jsonl ``` When installing package human-eval, please note its following disclaimer: This program exists to run untrusted model-generated code. Users are strongly encouraged not to do so outside of a robust security sandbox. The execution call in execution.py is deliberately commented out to ensure users read this disclaimer before running code in a potentially unsafe manner. See the comment in execution.py for more information and instructions. - - GSM8K ```Shell +# Qwen-7B python evaluate_gsm8k.py -``` \ No newline at end of file + +# Qwen-7B-Chat +python evaluate_chat_gsm8k.py # zeroshot +python evaluate_chat_gsm8k.py --use-fewshot # fewshot +```