From 7eb9016908e982ecae140febd78ea626d1fa6714 Mon Sep 17 00:00:00 2001
From: 兼欣
Date: Wed, 6 Dec 2023 12:57:11 +0800
Subject: [PATCH] update agent benchmarks and add qwen-72b results

---
 README.md               | 210 +++++++++++-----------------------------
 README_CN.md            | 210 +++++++++++-----------------------------
 README_ES.md            | 210 +++++++++++-----------------------------
 README_FR.md            | 210 +++++++++++-----------------------------
 README_JA.md            | 210 +++++++++++-----------------------------
 eval/EVALUATION.md      |   9 +-
 eval/evaluate_plugin.py |   6 +-
 7 files changed, 304 insertions(+), 761 deletions(-)

diff --git a/README.md b/README.md
index 97e135d..9ad4c14 100644
--- a/README.md
+++ b/README.md
@@ -1066,22 +1066,28 @@ We have tested the model's tool calling capabilities on our open-source Chinese
 <table>
-    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark</th></tr>
+    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark (Version 20231206)</th></tr>
     <tr><th>Model</th><th>Tool Selection (Acc.↑)</th><th>Tool Input (Rouge-L↑)</th><th>False Positive Error↓</th></tr>
-    <tr><td>GPT-4</td><td>95%</td><td>0.90</td><td>15.0%</td></tr>
-    <tr><td>GPT-3.5</td><td>85%</td><td>0.88</td><td>75.0%</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>98%</td><td>0.91</td><td>7.3%</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>98%</td><td>0.93</td><td>2.4%</td></tr>
+    <tr><td>GPT-4</td><td>98.0%</td><td>0.953</td><td>23.9%</td></tr>
+    <tr><td>GPT-3.5</td><td>74.5%</td><td>0.807</td><td>80.6%</td></tr>
+    <tr><td>Qwen-1_8B-Chat</td><td>85.0%</td><td>0.839</td><td>27.6%</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>95.5%</td><td>0.900</td><td>11.6%</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>96.9%</td><td>0.917</td><td>5.6%</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>98.2%</td><td>0.927</td><td>1.1%</td></tr>
 </table>
@@ -1091,127 +1097,85 @@ We have observed that Qwen performs well in terms of code executability and result accuracy
 <table>
-    <tr><th colspan="4" align="center">Executable Rate of Generated Code (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization↑</th><th>General↑</th></tr>
-    <tr><td>GPT-4</td><td>91.9</td><td>85.9</td><td>82.8</td></tr>
-    <tr><td>GPT-3.5</td><td>89.2</td><td>65.0</td><td>74.1</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>41.9</td><td>33.1</td><td>24.1</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>50.0</td><td>40.5</td><td>48.3</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>85.1</td><td>54.0</td><td>70.7</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>93.2</td><td>55.8</td><td>74.1</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>78.4</td><td>44.2</td><td>62.1</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>70.3</td><td>44.2</td><td>65.5</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>82.4</td><td>64.4</td><td>67.2</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>89.2</td><td>84.1</td><td>65.5</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">Accuracy of Code Execution Results (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th></tr>
-    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td></tr>
-    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>3.9</td><td>14.3</td><td>39.2</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>8.3</td><td>40.5</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>14.3</td><td>26.2</td><td>60.8</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>27.4</td><td>62.0</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>28.5</td><td>4.8</td><td>40.5</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>21.4</td><td>45.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>40.5</td><td>54.4</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>53.6</td><td>59.5</td></tr>
+    <tr><th colspan="6" align="center">Code Interpreter Benchmark (Version 20231206)</th></tr>
+    <tr><th rowspan="2">Model</th><th colspan="3">Accuracy of Code Execution Results (%)</th><th>Executable Rate of Code (%)</th></tr>
+    <tr><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th><th>General↑</th></tr>
+    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td><td>82.8</td></tr>
+    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td><td>74.1</td></tr>
+    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>1.2</td><td>15.2</td><td>48.3</td></tr>
+    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>15.5</td><td>21.5</td><td>74.1</td></tr>
+    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>10.7</td><td>25.1</td><td>65.5</td></tr>
+    <tr><td>ChatGLM3-6B</td><td>54.2</td><td>15.5</td><td>21.5</td><td>67.1</td></tr>
+    <tr><td>Qwen-1.8B-Chat</td><td>25.6</td><td>21.4</td><td>22.8</td><td>65.5</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>23.8</td><td>38.0</td><td>67.2</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>31.0</td><td>45.6</td><td>65.5</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>72.7</td><td>41.7</td><td>43.0</td><td>82.8</td></tr>
 </table>
@@ -1221,62 +1185,6 @@ We have observed that Qwen performs well in terms of code executability and result accuracy
 </table>
 
-In addition, we also provide experimental results demonstrating that our model is capable of acting as a HuggingFace Agent. For more information, please refer to the [example documentation](examples/transformers_agent.md). The model's performance on the evaluation dataset provided by Hugging Face is as follows:
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Run Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>100</td><td>100</td><td>97.4</td></tr>
-    <tr><td>GPT-3.5</td><td>95.4</td><td>96.3</td><td>87.0</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>86.1</td><td>87.0</td><td>68.9</td></tr>
-    <tr><td>StarCoder-15B</td><td>87.0</td><td>88.0</td><td>68.9</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>87.0</td><td>87.0</td><td>71.5</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>93.5</td><td>94.4</td><td>87.0</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Chat Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>97.9</td><td>97.9</td><td>98.5</td></tr>
-    <tr><td>GPT-3.5</td><td>97.3</td><td>96.8</td><td>89.6</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>97.9</td><td>97.9</td><td>91.1</td></tr>
-    <tr><td>StarCoder-15B</td><td>97.9</td><td>97.9</td><td>89.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>94.7</td><td>94.7</td><td>85.1</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>97.9</td><td>97.9</td><td>95.5</td></tr>
-</table>
-
 ## Long-Context Understanding
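As a hedged aside on the Tool Input (Rouge-L↑) column above: the score compares the model's generated `Action Input` string against a golden one. The sketch below uses the `rouge_score` package that eval/EVALUATION.md installs later in this patch; the example strings and the extraction step are illustrative assumptions, not the benchmark's actual code.

```python
# Minimal sketch of a Tool Input (Rouge-L↑) comparison. The golden and
# predicted Action Input strings here are invented for illustration.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"])
golden = '{"search_query": "Beijing weather today"}'
predicted = '{"search_query": "weather in Beijing today"}'
# score(target, prediction) returns per-type Score tuples; fmeasure is reported.
print(scorer.score(golden, predicted)["rougeL"].fmeasure)
```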
diff --git a/README_CN.md b/README_CN.md
index 40d99c8..1ee9f57 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -1059,22 +1059,28 @@ Qwen-Chat针对工具使用、函数调用能力进行了优化。用户可以
 <table>
-    <tr><th colspan="4" align="center">中文工具调用评测基准</th></tr>
+    <tr><th colspan="4" align="center">中文工具调用评测基准(版本 20231206)</th></tr>
     <tr><th>Model</th><th>Tool Selection (Acc.↑)</th><th>Tool Input (Rouge-L↑)</th><th>False Positive Error↓</th></tr>
-    <tr><td>GPT-4</td><td>95%</td><td>0.90</td><td>15.0%</td></tr>
-    <tr><td>GPT-3.5</td><td>85%</td><td>0.88</td><td>75.0%</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>98%</td><td>0.91</td><td>7.3%</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>98%</td><td>0.93</td><td>2.4%</td></tr>
+    <tr><td>GPT-4</td><td>98.0%</td><td>0.953</td><td>23.9%</td></tr>
+    <tr><td>GPT-3.5</td><td>74.5%</td><td>0.807</td><td>80.6%</td></tr>
+    <tr><td>Qwen-1_8B-Chat</td><td>85.0%</td><td>0.839</td><td>27.6%</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>95.5%</td><td>0.900</td><td>11.6%</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>96.9%</td><td>0.917</td><td>5.6%</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>98.2%</td><td>0.927</td><td>1.1%</td></tr>
 </table>
@@ -1083,127 +1089,85 @@ Qwen-Chat针对工具使用、函数调用能力进行了优化。用户可以
 <table>
-    <tr><th colspan="4" align="center">生成代码的可执行率 (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization↑</th><th>General↑</th></tr>
-    <tr><td>GPT-4</td><td>91.9</td><td>85.9</td><td>82.8</td></tr>
-    <tr><td>GPT-3.5</td><td>89.2</td><td>65.0</td><td>74.1</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>41.9</td><td>33.1</td><td>24.1</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>50.0</td><td>40.5</td><td>48.3</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>85.1</td><td>54.0</td><td>70.7</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>93.2</td><td>55.8</td><td>74.1</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>78.4</td><td>44.2</td><td>62.1</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>70.3</td><td>44.2</td><td>65.5</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>82.4</td><td>64.4</td><td>67.2</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>89.2</td><td>84.1</td><td>65.5</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">代码执行结果的正确率 (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th></tr>
-    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td></tr>
-    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>3.9</td><td>14.3</td><td>39.2</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>8.3</td><td>40.5</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>14.3</td><td>26.2</td><td>60.8</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>27.4</td><td>62.0</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>28.5</td><td>4.8</td><td>40.5</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>21.4</td><td>45.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>40.5</td><td>54.4</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>53.6</td><td>59.5</td></tr>
+    <tr><th colspan="6" align="center">Code Interpreter Benchmark (Version 20231206)</th></tr>
+    <tr><th rowspan="2">Model</th><th colspan="3">代码执行结果正确性 (%)</th><th>生成代码的可执行率 (%)</th></tr>
+    <tr><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th><th>General↑</th></tr>
+    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td><td>82.8</td></tr>
+    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td><td>74.1</td></tr>
+    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>1.2</td><td>15.2</td><td>48.3</td></tr>
+    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>15.5</td><td>21.5</td><td>74.1</td></tr>
+    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>10.7</td><td>25.1</td><td>65.5</td></tr>
+    <tr><td>ChatGLM3-6B</td><td>54.2</td><td>15.5</td><td>21.5</td><td>67.1</td></tr>
+    <tr><td>Qwen-1.8B-Chat</td><td>25.6</td><td>21.4</td><td>22.8</td><td>65.5</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>23.8</td><td>38.0</td><td>67.2</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>31.0</td><td>45.6</td><td>65.5</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>72.7</td><td>41.7</td><td>43.0</td><td>82.8</td></tr>
 </table>
@@ -1213,62 +1177,6 @@ Qwen-Chat针对工具使用、函数调用能力进行了优化。用户可以
 </table>
 
-此外,我们还提供了实验结果表明我们的模型具备扮演HuggingFace Agent的能力,详见[示例文档](examples/transformers_agent.md)了解更多信息。模型在Hugging Face提供的评测数据集上表现如下:
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent评测基准 - Run模式</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>100</td><td>100</td><td>97.4</td></tr>
-    <tr><td>GPT-3.5</td><td>95.4</td><td>96.3</td><td>87.0</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>86.1</td><td>87.0</td><td>68.9</td></tr>
-    <tr><td>StarCoder-15B</td><td>87.0</td><td>88.0</td><td>68.9</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>87.0</td><td>87.0</td><td>71.5</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>93.5</td><td>94.4</td><td>87.0</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent评测基准 - Chat模式</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>97.9</td><td>97.9</td><td>98.5</td></tr>
-    <tr><td>GPT-3.5</td><td>97.3</td><td>96.8</td><td>89.6</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>97.9</td><td>97.9</td><td>91.1</td></tr>
-    <tr><td>StarCoder-15B</td><td>97.9</td><td>97.9</td><td>89.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>94.7</td><td>94.7</td><td>85.1</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>97.9</td><td>97.9</td><td>95.5</td></tr>
-</table>
-
 ## 长文本理解
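On the False Positive Error↓ column above: it counts cases where the model emits a tool call for a query that none of the provided tools can answer. A minimal sketch mirroring the `react_negative` loop in the eval/evaluate_plugin.py hunk at the end of this patch; the sample responses are invented.

```python
# Sketch of the False Positive Error↓ metric: over negative queries, count
# responses that still emit a ReAct tool-call line. Responses are invented.
negative_jobs = [
    {"gen": ["Thought: no tool applies here.\nFinal Answer: I cannot help."]},
    {"gen": ['Thought: try a tool anyway.\nAction: google_search\nAction Input: {"q": "?"}']},
]
bad_count = sum("\nAction: " in job["gen"][0] for job in negative_jobs)
print(f"false positive error: {bad_count / len(negative_jobs):.1%}")  # 50.0%
```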
diff --git a/README_ES.md b/README_ES.md
index 1855c89..2939aad 100644
--- a/README_ES.md
+++ b/README_ES.md
@@ -1026,22 +1026,28 @@ Hemos probado las capacidades de llamada de la herramienta del modelo en nuestro
 <table>
-    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark</th></tr>
+    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark (Version 20231206)</th></tr>
     <tr><th>Model</th><th>Tool Selection (Acc.↑)</th><th>Tool Input (Rouge-L↑)</th><th>False Positive Error↓</th></tr>
-    <tr><td>GPT-4</td><td>95%</td><td>0.90</td><td>15.0%</td></tr>
-    <tr><td>GPT-3.5</td><td>85%</td><td>0.88</td><td>75.0%</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>98%</td><td>0.91</td><td>7.3%</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>98%</td><td>0.93</td><td>2.4%</td></tr>
+    <tr><td>GPT-4</td><td>98.0%</td><td>0.953</td><td>23.9%</td></tr>
+    <tr><td>GPT-3.5</td><td>74.5%</td><td>0.807</td><td>80.6%</td></tr>
+    <tr><td>Qwen-1_8B-Chat</td><td>85.0%</td><td>0.839</td><td>27.6%</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>95.5%</td><td>0.900</td><td>11.6%</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>96.9%</td><td>0.917</td><td>5.6%</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>98.2%</td><td>0.927</td><td>1.1%</td></tr>
 </table>
@@ -1051,127 +1057,85 @@ Hemos observado que Qwen funciona bien en términos de ejecutabilidad del código
 <table>
-    <tr><th colspan="4" align="center">Executable Rate of Generated Code (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization↑</th><th>General↑</th></tr>
-    <tr><td>GPT-4</td><td>91.9</td><td>85.9</td><td>82.8</td></tr>
-    <tr><td>GPT-3.5</td><td>89.2</td><td>65.0</td><td>74.1</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>41.9</td><td>33.1</td><td>24.1</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>50.0</td><td>40.5</td><td>48.3</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>85.1</td><td>54.0</td><td>70.7</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>93.2</td><td>55.8</td><td>74.1</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>78.4</td><td>44.2</td><td>62.1</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>70.3</td><td>44.2</td><td>65.5</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>82.4</td><td>64.4</td><td>67.2</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>89.2</td><td>84.1</td><td>65.5</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">Accuracy of Code Execution Results (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th></tr>
-    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td></tr>
-    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>3.9</td><td>14.3</td><td>39.2</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>8.3</td><td>40.5</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>14.3</td><td>26.2</td><td>60.8</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>27.4</td><td>62.0</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>28.5</td><td>4.8</td><td>40.5</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>21.4</td><td>45.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>40.5</td><td>54.4</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>53.6</td><td>59.5</td></tr>
+    <tr><th colspan="6" align="center">Code Interpreter Benchmark (Version 20231206)</th></tr>
+    <tr><th rowspan="2">Model</th><th colspan="3">Accuracy of Code Execution Results (%)</th><th>Executable Rate of Code (%)</th></tr>
+    <tr><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th><th>General↑</th></tr>
+    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td><td>82.8</td></tr>
+    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td><td>74.1</td></tr>
+    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>1.2</td><td>15.2</td><td>48.3</td></tr>
+    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>15.5</td><td>21.5</td><td>74.1</td></tr>
+    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>10.7</td><td>25.1</td><td>65.5</td></tr>
+    <tr><td>ChatGLM3-6B</td><td>54.2</td><td>15.5</td><td>21.5</td><td>67.1</td></tr>
+    <tr><td>Qwen-1.8B-Chat</td><td>25.6</td><td>21.4</td><td>22.8</td><td>65.5</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>23.8</td><td>38.0</td><td>67.2</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>31.0</td><td>45.6</td><td>65.5</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>72.7</td><td>41.7</td><td>43.0</td><td>82.8</td></tr>
 </table>
@@ -1181,62 +1145,6 @@ Hemos observado que Qwen funciona bien en términos de ejecutabilidad del código
 </table>
 
-Además, también proporcionamos resultados experimentales que demuestran que nuestro modelo es capaz de actuar como un Agente HuggingFace. Para más información, consulte la [documentación del ejemplo](examples/transformers_agent.md). El rendimiento del modelo en el conjunto de datos de evaluación proporcionado por Hugging Face es el siguiente:
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Run Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>100</td><td>100</td><td>97.4</td></tr>
-    <tr><td>GPT-3.5</td><td>95.4</td><td>96.3</td><td>87.0</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>86.1</td><td>87.0</td><td>68.9</td></tr>
-    <tr><td>StarCoder-15B</td><td>87.0</td><td>88.0</td><td>68.9</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>87.0</td><td>87.0</td><td>71.5</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>93.5</td><td>94.4</td><td>87.0</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Chat Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>97.9</td><td>97.9</td><td>98.5</td></tr>
-    <tr><td>GPT-3.5</td><td>97.3</td><td>96.8</td><td>89.6</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>97.9</td><td>97.9</td><td>91.1</td></tr>
-    <tr><td>StarCoder-15B</td><td>97.9</td><td>97.9</td><td>89.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>94.7</td><td>94.7</td><td>85.1</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>97.9</td><td>97.9</td><td>95.5</td></tr>
-</table>
-
 ## Comprensión del Contexto Largo
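All of the tool-use numbers above are computed from ReAct-format responses. The following is a simplified, hypothetical stand-in for the field extraction that `process_res()` performs in the eval/evaluate_plugin.py hunk at the end of this patch; the response text and split logic are illustrative, not the script's exact code.

```python
# Simplified sketch of splitting a ReAct response into the thought / action /
# action_input fields that process_res() compares. Response text is invented.
response = (
    "Thought: I should look this up.\n"
    "Action: google_search\n"
    'Action Input: {"search_query": "Qwen-72B release date"}\n'
    "Observation:"
)
thought = response.split("Thought: ")[-1].split("\nAction: ")[0]
action = response.split("\nAction: ")[-1].split("\nAction Input: ")[0]
action_input = response.split("\nAction Input: ")[-1].split("\nObservation:")[0]
print(thought, action, action_input, sep=" | ")
```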
diff --git a/README_FR.md b/README_FR.md
index 19efd9e..38a3c43 100644
--- a/README_FR.md
+++ b/README_FR.md
@@ -1029,22 +1029,28 @@ Nous avons testé les capacités d'appel d'outil du modèle sur notre benchmark
 <table>
-    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark</th></tr>
+    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark (Version 20231206)</th></tr>
     <tr><th>Model</th><th>Tool Selection (Acc.↑)</th><th>Tool Input (Rouge-L↑)</th><th>False Positive Error↓</th></tr>
-    <tr><td>GPT-4</td><td>95%</td><td>0.90</td><td>15.0%</td></tr>
-    <tr><td>GPT-3.5</td><td>85%</td><td>0.88</td><td>75.0%</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>98%</td><td>0.91</td><td>7.3%</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>98%</td><td>0.93</td><td>2.4%</td></tr>
+    <tr><td>GPT-4</td><td>98.0%</td><td>0.953</td><td>23.9%</td></tr>
+    <tr><td>GPT-3.5</td><td>74.5%</td><td>0.807</td><td>80.6%</td></tr>
+    <tr><td>Qwen-1_8B-Chat</td><td>85.0%</td><td>0.839</td><td>27.6%</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>95.5%</td><td>0.900</td><td>11.6%</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>96.9%</td><td>0.917</td><td>5.6%</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>98.2%</td><td>0.927</td><td>1.1%</td></tr>
 </table>
@@ -1054,127 +1060,85 @@ Nous avons observé que Qwen est performant en termes d'exécutabilité du code
 <table>
-    <tr><th colspan="4" align="center">Executable Rate of Generated Code (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization↑</th><th>General↑</th></tr>
-    <tr><td>GPT-4</td><td>91.9</td><td>85.9</td><td>82.8</td></tr>
-    <tr><td>GPT-3.5</td><td>89.2</td><td>65.0</td><td>74.1</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>41.9</td><td>33.1</td><td>24.1</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>50.0</td><td>40.5</td><td>48.3</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>85.1</td><td>54.0</td><td>70.7</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>93.2</td><td>55.8</td><td>74.1</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>78.4</td><td>44.2</td><td>62.1</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>70.3</td><td>44.2</td><td>65.5</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>82.4</td><td>64.4</td><td>67.2</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>89.2</td><td>84.1</td><td>65.5</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">Accuracy of Code Execution Results (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th></tr>
-    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td></tr>
-    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>3.9</td><td>14.3</td><td>39.2</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>8.3</td><td>40.5</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>14.3</td><td>26.2</td><td>60.8</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>27.4</td><td>62.0</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>28.5</td><td>4.8</td><td>40.5</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>21.4</td><td>45.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>40.5</td><td>54.4</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>53.6</td><td>59.5</td></tr>
+    <tr><th colspan="6" align="center">Code Interpreter Benchmark (Version 20231206)</th></tr>
+    <tr><th rowspan="2">Model</th><th colspan="3">Accuracy of Code Execution Results (%)</th><th>Executable Rate of Code (%)</th></tr>
+    <tr><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th><th>General↑</th></tr>
+    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td><td>82.8</td></tr>
+    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td><td>74.1</td></tr>
+    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>1.2</td><td>15.2</td><td>48.3</td></tr>
+    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>15.5</td><td>21.5</td><td>74.1</td></tr>
+    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>10.7</td><td>25.1</td><td>65.5</td></tr>
+    <tr><td>ChatGLM3-6B</td><td>54.2</td><td>15.5</td><td>21.5</td><td>67.1</td></tr>
+    <tr><td>Qwen-1.8B-Chat</td><td>25.6</td><td>21.4</td><td>22.8</td><td>65.5</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>23.8</td><td>38.0</td><td>67.2</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>31.0</td><td>45.6</td><td>65.5</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>72.7</td><td>41.7</td><td>43.0</td><td>82.8</td></tr>
 </table>
@@ -1184,62 +1148,6 @@ Nous avons observé que Qwen est performant en termes d'exécutabilité du code
 </table>
 
-En outre, nous fournissons également des résultats expérimentaux démontrant que notre modèle est capable d'agir en tant qu'agent Hugging Face. Pour plus d'informations, veuillez vous référer à la [documentation de l'exemple](examples/transformers_agent.md). Les performances du modèle sur l'ensemble des données d'évaluation fournies par Hugging Face sont les suivantes:
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Run Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>100</td><td>100</td><td>97.4</td></tr>
-    <tr><td>GPT-3.5</td><td>95.4</td><td>96.3</td><td>87.0</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>86.1</td><td>87.0</td><td>68.9</td></tr>
-    <tr><td>StarCoder-15B</td><td>87.0</td><td>88.0</td><td>68.9</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>87.0</td><td>87.0</td><td>71.5</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>93.5</td><td>94.4</td><td>87.0</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Chat Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>97.9</td><td>97.9</td><td>98.5</td></tr>
-    <tr><td>GPT-3.5</td><td>97.3</td><td>96.8</td><td>89.6</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>97.9</td><td>97.9</td><td>91.1</td></tr>
-    <tr><td>StarCoder-15B</td><td>97.9</td><td>97.9</td><td>89.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>94.7</td><td>94.7</td><td>85.1</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>97.9</td><td>97.9</td><td>95.5</td></tr>
-</table>
-
 ## Compréhension du Contexte Long
diff --git a/README_JA.md b/README_JA.md
index be232ed..e8646f6 100644
--- a/README_JA.md
+++ b/README_JA.md
@@ -1056,22 +1056,28 @@ ReAct プロンプトの原則に基づいてツール呼び出しを実装する
 <table>
-    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark</th></tr>
+    <tr><th colspan="4" align="center">Chinese Tool-Use Benchmark (Version 20231206)</th></tr>
     <tr><th>Model</th><th>Tool Selection (Acc.↑)</th><th>Tool Input (Rouge-L↑)</th><th>False Positive Error↓</th></tr>
-    <tr><td>GPT-4</td><td>95%</td><td>0.90</td><td>15.0%</td></tr>
-    <tr><td>GPT-3.5</td><td>85%</td><td>0.88</td><td>75.0%</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>98%</td><td>0.91</td><td>7.3%</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>98%</td><td>0.93</td><td>2.4%</td></tr>
+    <tr><td>GPT-4</td><td>98.0%</td><td>0.953</td><td>23.9%</td></tr>
+    <tr><td>GPT-3.5</td><td>74.5%</td><td>0.807</td><td>80.6%</td></tr>
+    <tr><td>Qwen-1_8B-Chat</td><td>85.0%</td><td>0.839</td><td>27.6%</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>95.5%</td><td>0.900</td><td>11.6%</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>96.9%</td><td>0.917</td><td>5.6%</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>98.2%</td><td>0.927</td><td>1.1%</td></tr>
 </table>
@@ -1081,127 +1087,85 @@ Qwen は、コード生成時のコードの実行可能性と結果の精度の
 <table>
-    <tr><th colspan="4" align="center">Executable Rate of Generated Code (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization↑</th><th>General↑</th></tr>
-    <tr><td>GPT-4</td><td>91.9</td><td>85.9</td><td>82.8</td></tr>
-    <tr><td>GPT-3.5</td><td>89.2</td><td>65.0</td><td>74.1</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>41.9</td><td>33.1</td><td>24.1</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>50.0</td><td>40.5</td><td>48.3</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>85.1</td><td>54.0</td><td>70.7</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>93.2</td><td>55.8</td><td>74.1</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>78.4</td><td>44.2</td><td>62.1</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>70.3</td><td>44.2</td><td>65.5</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>82.4</td><td>64.4</td><td>67.2</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>89.2</td><td>84.1</td><td>65.5</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">Accuracy of Code Execution Results (%)</th></tr>
-    <tr><th>Model</th><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th></tr>
-    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td></tr>
-    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td></tr>
-    <tr><td>LLaMA2-7B-Chat</td><td>3.9</td><td>14.3</td><td>39.2</td></tr>
-    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>8.3</td><td>40.5</td></tr>
-    <tr><td>CodeLLaMA-7B-Instruct</td><td>14.3</td><td>26.2</td><td>60.8</td></tr>
-    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>27.4</td><td>62.0</td></tr>
-    <tr><td>InternLM-7B-Chat-v1.1</td><td>28.5</td><td>4.8</td><td>40.5</td></tr>
-    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>21.4</td><td>45.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>40.5</td><td>54.4</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>53.6</td><td>59.5</td></tr>
+    <tr><th colspan="6" align="center">Code Interpreter Benchmark (Version 20231206)</th></tr>
+    <tr><th rowspan="2">Model</th><th colspan="3">Accuracy of Code Execution Results (%)</th><th>Executable Rate of Code (%)</th></tr>
+    <tr><th>Math↑</th><th>Visualization-Hard↑</th><th>Visualization-Easy↑</th><th>General↑</th></tr>
+    <tr><td>GPT-4</td><td>82.8</td><td>66.7</td><td>60.8</td><td>82.8</td></tr>
+    <tr><td>GPT-3.5</td><td>47.3</td><td>33.3</td><td>55.7</td><td>74.1</td></tr>
+    <tr><td>LLaMA2-13B-Chat</td><td>8.3</td><td>1.2</td><td>15.2</td><td>48.3</td></tr>
+    <tr><td>CodeLLaMA-13B-Instruct</td><td>28.2</td><td>15.5</td><td>21.5</td><td>74.1</td></tr>
+    <tr><td>InternLM-20B-Chat</td><td>34.6</td><td>10.7</td><td>25.1</td><td>65.5</td></tr>
+    <tr><td>ChatGLM3-6B</td><td>54.2</td><td>15.5</td><td>21.5</td><td>67.1</td></tr>
+    <tr><td>Qwen-1.8B-Chat</td><td>25.6</td><td>21.4</td><td>22.8</td><td>65.5</td></tr>
+    <tr><td>Qwen-7B-Chat</td><td>41.9</td><td>23.8</td><td>38.0</td><td>67.2</td></tr>
+    <tr><td>Qwen-14B-Chat</td><td>58.4</td><td>31.0</td><td>45.6</td><td>65.5</td></tr>
+    <tr><td>Qwen-72B-Chat</td><td>72.7</td><td>41.7</td><td>43.0</td><td>82.8</td></tr>
 </table>
@@ -1211,62 +1175,6 @@ Qwen は、コード生成時のコードの実行可能性と結果の精度の
 </table>
 
-さらに、Qwenが HuggingFace Agent として機能できることを実証する実験結果も提供します。詳細については、[ドキュメント例](examples/transformers_agent.md) を参照してください。Hugging Face が提供する評価データセットにおけるモデルのパフォーマンスは次のとおりです。
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Run Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>100</td><td>100</td><td>97.4</td></tr>
-    <tr><td>GPT-3.5</td><td>95.4</td><td>96.3</td><td>87.0</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>86.1</td><td>87.0</td><td>68.9</td></tr>
-    <tr><td>StarCoder-15B</td><td>87.0</td><td>88.0</td><td>68.9</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>87.0</td><td>87.0</td><td>71.5</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>93.5</td><td>94.4</td><td>87.0</td></tr>
-</table>
-
-<table>
-    <tr><th colspan="4" align="center">HuggingFace Agent Benchmark - Chat Mode</th></tr>
-    <tr><th>Model</th><th>Tool Selection↑</th><th>Tool Used↑</th><th>Code↑</th></tr>
-    <tr><td>GPT-4</td><td>97.9</td><td>97.9</td><td>98.5</td></tr>
-    <tr><td>GPT-3.5</td><td>97.3</td><td>96.8</td><td>89.6</td></tr>
-    <tr><td>StarCoder-Base-15B</td><td>97.9</td><td>97.9</td><td>91.1</td></tr>
-    <tr><td>StarCoder-15B</td><td>97.9</td><td>97.9</td><td>89.6</td></tr>
-    <tr><td>Qwen-7B-Chat</td><td>94.7</td><td>94.7</td><td>85.1</td></tr>
-    <tr><td>Qwen-14B-Chat</td><td>97.9</td><td>97.9</td><td>95.5</td></tr>
-</table>
-
 ## 長い文脈の理解
diff --git a/eval/EVALUATION.md b/eval/EVALUATION.md
index 5baeb4d..b939ad2 100644
--- a/eval/EVALUATION.md
+++ b/eval/EVALUATION.md
@@ -85,9 +85,13 @@ This script is used to reproduce the results of the ReAct and Hugging Face Agent
 # Qwen-7B-Chat
 mkdir data; cd data;
-wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_positive.jsonl;
-wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_negative.jsonl;
-cd ..;
+## Old Evaluation Dataset (Version 20230803)
+# wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_positive.jsonl;
+# wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v1/exam_plugin_v1_react_negative.jsonl;
+## New Evaluation Dataset (Version 20231206)
+wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_positive.jsonl;
+wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/opensource_data/exam_plugin_v20231206/exam_plugin_v20231206_react_negative.jsonl;
+cd ..;
 pip install json5;
 pip install jsonlines;
 pip install rouge_score;
 
diff --git a/eval/evaluate_plugin.py b/eval/evaluate_plugin.py
index f3b953b..94d18aa 100644
--- a/eval/evaluate_plugin.py
+++ b/eval/evaluate_plugin.py
@@ -46,7 +46,7 @@ def process_res(response):
         )
     except:
         # print("JSON Load Error:", action_input)
-        pass
+        action_input = ""
     res_dict = {
         "thought": thought,
         "action": action,
@@ -80,7 +80,7 @@ def eval_action(job):
     response = job["gen"][0]
     golden = job["response"]
 
-    if "Action:" in response:
+    if "\nAction: " in response:
         response, golden = process_res(response), process_res(golden)
         if is_callable(response, golden):
             return True
@@ -263,7 +263,7 @@ def main(args):
         filename=args.eval_react_negative_filename, model=model, tokenizer=tokenizer
    )
     for job in jobs:
-        if "\nAction:" in job["gen"][0]:
+        if "\nAction: " in job["gen"][0]:
             bad_count += 1
     scores = {"bad_rate": bad_count / len(jobs)}
     result.update({"react_negative": scores})
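Two behavioral notes on the evaluate_plugin.py changes above, as a hedged sketch with invented sample strings: the tool-call matcher is tightened from `"Action:"` to `"\nAction: "`, and an unparseable `Action Input` now falls back to an empty string instead of keeping stale text.

```python
# Sketch of the two eval fixes above. Sample strings are invented.
import json5  # installed via `pip install json5` in eval/EVALUATION.md

# 1) "\nAction: " only matches a well-formed ReAct line; the old "Action:"
#    also fired when the token appeared in ordinary prose, inflating the
#    false-positive count on negative queries.
chatty = "No tool is needed; taking an Action: here would be wrong."
react = 'Thought: search.\nAction: google_search\nAction Input: {"q": "qwen"}'
assert "Action:" in chatty and "\nAction: " not in chatty
assert "\nAction: " in react

# 2) An Action Input that fails to parse now becomes "" (mirroring the
#    patched except branch) instead of leaking the raw text into scoring.
raw = '{"q": "qwen"'  # truncated, invalid JSON
try:
    action_input = json5.loads(raw)
except Exception:
    action_input = ""
print(repr(action_input))  # ''
```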