parent d275e5b91a · commit ee01f36ed9
@@ -0,0 +1,131 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "54d5d255-aa98-4655-8dd1-bc726430d86a",
   "metadata": {},
   "source": [
    "# Qwen-7B-Chat Chat Demo"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31e04af4-eb27-4802-a7b2-6ea0525f1dc8",
   "metadata": {},
   "source": [
    "This notebook uses Qwen-7B-Chat as an example to show how to build a web-based conversational assistant with Gradio."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75e51155-9f8e-40dc-8432-60f4567d93a8",
   "metadata": {},
   "source": [
    "## Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff6f061c-a033-49f2-8f7d-af3f23ac9125",
   "metadata": {},
   "source": [
    "Download Qwen-7B-Chat\n",
    "\n",
    "First, we need to download the model. You can use the `snapshot_download` function that comes with ModelScope to download the model to a specified directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c469a129-451f-4d01-8bc0-e2cf70a262c8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install modelscope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69af626e-22b8-49ad-8869-8354f4c72bcc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from modelscope.hub.snapshot_download import snapshot_download\n",
    "snapshot_download(\"qwen/Qwen-7B-Chat\", cache_dir='/tmp/models')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01d2ff34-4053-4710-a289-e354673be1ca",
   "metadata": {},
   "source": [
    "## Install Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48b51791-4bbc-4d12-9cd6-587c24c8bea7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip install -r ../../../requirements.txt\n",
    "!pip install gradio==3.37.0 mdtex2html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7732037a-246a-4953-af07-dae7a3ae5937",
   "metadata": {},
   "source": [
    "## Run the web UI code to start the Qwen chatbot\n",
    "\n",
    "Run the web_demo.py file to have real-time conversations with Qwen-7B-Chat on the web page."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e256f0a-d96d-4fd7-b305-fe43c6959dc8",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!python ../../../web_demo.py -c /tmp/models/qwen/Qwen-7B-Chat"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,142 @@
# Fine-tuning Qwen by Ascend NPU
Below, we provide a simple example to show how to fine-tune Qwen on Ascend NPUs. You can also refer to the official [mindformers](https://gitee.com/mindspore/mindformers/blob/dev/research/qwen/qwen.md) documentation for detailed usage.

## Environment Requirement

- Hardware: Ascend 910A/B

## Quickstart

1. Launch Docker Image

```bash
ImageID=pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:qwen_v23.0.rc3
docker run -it -u root --ipc=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci4 \
--device=/dev/davinci5 \
--device=/dev/davinci6 \
--device=/dev/davinci7 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /var/log/npu/:/usr/slog \
-v /etc/hccn.conf:/etc/hccn.conf \
${ImageID} /bin/bash
```

2. Download and Convert Model

- download the model with ModelScope

```bash
cd mindformers
python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-7B-Chat', cache_dir='.', revision='master')"
```

- convert HF model weights to ckpt weights

```bash
python research/qwen/convert_weight.py \
--torch_ckpt_dir Qwen/Qwen-7B-Chat \
--mindspore_ckpt_path qwen-7b-chat.ckpt

mkdir -vp load_checkpoint/rank_0
mv qwen-7b-chat.ckpt load_checkpoint/rank_0/
```

3. Prepare Training Data

- download demo data

```bash
wget -c https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/alpaca_data_min.json
```

- convert the raw data to the specified conversation format

```bash
python research/qwen/alpaca_converter.py \
--data_path alpaca_data_min.json \
--output_path alpaca-data-conversation_min.json
```

- generate MindRecord data

```bash
python research/qwen/qwen_preprocess.py \
--input_glob alpaca-data-conversation_min.json \
--model_file Qwen/Qwen-7B-Chat/qwen.tiktoken \
--seq_length 1024 \
--output_file alpaca_min.mindrecord
```

4. Prepare RANK_TABLE_FILE

```bash
# generate RANK_TABLE_FILE with 8 NPUs
python mindformers/tools/hccl_tools.py --device_num "[0,8)"
```
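
Before launching the fine-tuning job, it can help to sanity-check the generated rank table. The snippet below is a minimal sketch only: it assumes the standard HCCL rank-table JSON schema (`server_list` / `device`) and that `hccl_tools.py` wrote the file into the current directory with an `hccl_*.json` name; adjust the glob pattern if your setup writes it elsewhere.

```python
# Minimal sanity check of the rank table generated in step 4 (illustrative only).
import glob
import json

candidates = sorted(glob.glob("hccl_*.json"))
assert candidates, "No RANK_TABLE_FILE found; re-run hccl_tools.py first."

with open(candidates[-1]) as f:
    rank_table = json.load(f)

# For this 8-NPU example the table should describe one server with 8 devices.
devices = rank_table["server_list"][0]["device"]
print(f"rank table: {candidates[-1]}, devices: {len(devices)}")
```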

5. Fine-tune

You need to replace RANK_TABLE_FILE with the file generated in step 4.

```bash
export MS_ASCEND_CHECK_OVERFLOW_MODE=INFNAN_MODE
bash research/run_singlenode.sh "python3 research/qwen/run_qwen.py \
--config research/qwen/run_qwen_7b.yaml \
--load_checkpoint /mindformers/research/qwen/load_checkpoint \
--vocab_file Qwen/Qwen-7B-Chat/qwen.tiktoken \
--use_parallel True \
--run_mode finetune \
--auto_trans_ckpt True \
--train_data alpaca_min.mindrecord" \
RANK_TABLE_FILE [0,8] 8
```

6. Merge Model Weights

- Rename model weights

```bash
cd output/checkpoint_network
mv rank_0/qwen_rank_0-network.ckpt rank_0/checkpoint_0.ckpt
mv rank_1/qwen_rank_1-network.ckpt rank_1/checkpoint_1.ckpt
mv rank_2/qwen_rank_2-network.ckpt rank_2/checkpoint_2.ckpt
mv rank_3/qwen_rank_3-network.ckpt rank_3/checkpoint_3.ckpt
mv rank_4/qwen_rank_4-network.ckpt rank_4/checkpoint_4.ckpt
mv rank_5/qwen_rank_5-network.ckpt rank_5/checkpoint_5.ckpt
mv rank_6/qwen_rank_6-network.ckpt rank_6/checkpoint_6.ckpt
mv rank_7/qwen_rank_7-network.ckpt rank_7/checkpoint_7.ckpt
cd ../..
```

- Merge model weights

```bash
python mindformers/tools/transform_ckpt.py \
--src_ckpt_strategy output/strategy \
--src_ckpt_dir output/checkpoint_network \
--dst_ckpt_dir output/merged_model
```

7. Run Inference with the Fine-tuned Model

```bash
python research/qwen/run_qwen.py \
--config research/qwen/run_qwen_7b.yaml \
--predict_data '比较适合深度学习入门的书籍有' \
--run_mode predict \
--load_checkpoint output/merged_model/rank_0/checkpoint_0.ckpt \
--auto_trans_ckpt False \
--device_id 0
```
@@ -0,0 +1,2 @@
deepspeed
peft
@@ -0,0 +1,198 @@
## Introduction
[SWIFT](https://github.com/modelscope/swift) (Scalable lightWeight Infrastructure for Fine-Tuning) is an extensible framework designed to facilitate lightweight model fine-tuning and inference. It integrates implementations of various efficient fine-tuning methods, embracing approaches that are parameter-efficient, memory-efficient, and time-efficient. SWIFT integrates seamlessly into the ModelScope ecosystem and offers the capability to fine-tune various models, with a primary emphasis on LLMs and vision models. Additionally, SWIFT is fully compatible with PEFT, enabling users to leverage the familiar PEFT interface to fine-tune ModelScope models.

## Installation

```shell
# Set the global pip mirror
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
# Install ms-swift
git clone https://github.com/modelscope/swift.git
cd swift
pip install -e .[llm]

# If you want to use deepspeed
pip install deepspeed -U

# If you want to use qlora training based on auto_gptq (recommended, performs better than bnb)
# Models supporting auto_gptq: `https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md#模型`
# There's a version correspondence between auto_gptq and cuda; refer to `https://github.com/PanQiWei/AutoGPTQ#quick-installation` for selecting the appropriate version
pip install auto_gptq -U

# If you want to use qlora training based on bnb
pip install bitsandbytes -U

# Environment alignment (run the following commands if you encounter errors; the repository is tested with the latest environment)
pip install -r requirements/framework.txt -U
pip install -r requirements/llm.txt -U
```

## WebUI Usage

Run the following command to start the web UI and conduct model training and inference through the graphical interface:
```shell
swift web-ui
```
A screenshot example can be found at:


## Fine-tuning

```shell
# Experimental environment: A10, 3090, V100, ...
# GPU memory requirement: 20GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model_id_or_path qwen/Qwen-7B-Chat \
    --dataset blossom-math-zh \
    --output_dir output \

# Use your own dataset
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model_id_or_path qwen/Qwen-7B-Chat \
    --custom_train_dataset_path chatml.jsonl \
    --output_dir output \

# Using DDP (Distributed Data Parallel)
# Experimental environment: 2 * 3090
# GPU memory requirement: 2 * 23GB
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=2 \
swift sft \
    --model_id_or_path qwen/Qwen-7B-Chat \
    --dataset blossom-math-zh \
    --output_dir output \

# Multi-machine multi-GPU setup
# node0
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NNODES=2 \
NODE_RANK=0 \
MASTER_ADDR=127.0.0.1 \
NPROC_PER_NODE=4 \
swift sft \
    --model_id_or_path qwen/Qwen-7B-Chat \
    --dataset blossom-math-zh \
    --output_dir output \
# node1
CUDA_VISIBLE_DEVICES=0,1,2,3 \
NNODES=2 \
NODE_RANK=1 \
MASTER_ADDR=xxx.xxx.xxx.xxx \
NPROC_PER_NODE=4 \
swift sft \
    --model_id_or_path qwen/Qwen-7B-Chat \
    --dataset blossom-math-zh \
    --output_dir output \
```
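
If you prefer launching fine-tuning from Python rather than the CLI, ms-swift also exposes a programmatic entry point. The following is a minimal sketch assuming the `SftArguments` / `sft_main` interface exported by `swift.llm` in the release installed above; check the argument names against your installed version.

```python
# Programmatic fine-tuning sketch; assumes swift.llm exports SftArguments and
# sft_main as in the ms-swift release installed above.
import os

from swift.llm import SftArguments, sft_main

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

sft_args = SftArguments(
    model_id_or_path='qwen/Qwen-7B-Chat',  # same model id as the CLI example
    dataset=['blossom-math-zh'],           # or point to your own ChatML-style jsonl
    output_dir='output',
)
result = sft_main(sft_args)
print(result)  # training summary, including checkpoint locations
```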

For more fine-tuning methods, please refer to [here](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM%E5%BE%AE%E8%B0%83%E6%96%87%E6%A1%A3.md#%E5%BE%AE%E8%B0%83).

### Examples

| Model Name | Training Method |
|:-------------------|:------------------|
| qwen_14b | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds) |
| qwen_14b | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b/qlora) |
| qwen_14b | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds) |
| qwen_14b_chat | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds) |
| qwen_14b_chat | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat/qlora) |
| qwen_14b_chat | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds) |
| qwen_14b_chat_int4 | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora) |
| qwen_14b_chat_int4 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds) |
| qwen_14b_chat_int8 | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora) |
| qwen_14b_chat_int8 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds) |
| qwen_1_8b_chat | [full](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_1_8b_chat/full) |
| qwen_1_8b_chat | [full_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_1_8b_chat/full_ddp) |
| qwen_72b_chat | [lora_mp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_72b_chat/lora_mp) |
| qwen_72b_chat | [lora_mp_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_72b_chat/lora_mp_ddp) |
| qwen_72b_chat | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_72b_chat/qlora) |
| qwen_72b_chat_int4 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_72b_chat_int4/qlora_ddp_ds) |
| qwen_72b_chat_int8 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_72b_chat_int8/qlora_ddp_ds) |
| qwen_7b | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds) |
| qwen_7b | [qlora_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp) |
| qwen_7b_chat | [full](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full) |
| qwen_7b_chat | [full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp) |
| qwen_7b_chat | [full_mp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp) |
| qwen_7b_chat | [full_mp_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp) |
| qwen_7b_chat | [lora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/lora) |
| qwen_7b_chat | [lora_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp) |
| qwen_7b_chat | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds) |
| qwen_7b_chat | [lora_mp_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp) |
| qwen_7b_chat | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/qlora) |
| qwen_7b_chat | [qlora_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp) |
| qwen_7b_chat | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds) |
| qwen_7b_chat_int4 | [qalora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qalora) |
| qwen_7b_chat_int4 | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora) |
| qwen_7b_chat_int4 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds) |
| qwen_7b_chat_int8 | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora) |
| qwen_7b_chat_int8 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds) |
| qwen_audio_chat | [full_mp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_audio_chat/full_mp) |
| qwen_audio_chat | [full_mp_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_audio_chat/full_mp_ddp) |
| qwen_audio_chat | [lora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_audio_chat/lora) |
| qwen_audio_chat | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_audio_chat/lora_ddp_ds) |
| qwen_vl | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds) |
| qwen_vl_chat | [full_mp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat/full_mp) |
| qwen_vl_chat | [full_mp_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat/full_mp_ddp) |
| qwen_vl_chat | [lora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat/lora) |
| qwen_vl_chat | [lora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds) |
| qwen_vl_chat | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat/qlora) |
| qwen_vl_chat_int4 | [qlora](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora) |
| qwen_vl_chat_int4 | [qlora_ddp_ds](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds) |

## Inference

```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    get_model_tokenizer, get_template, inference, ModelType, get_default_template_type,
)
from swift.utils import seed_everything

model_type = ModelType.qwen_7b_chat
template_type = get_default_template_type(model_type)
print(f'template_type: {template_type}')  # template_type: qwen

kwargs = {}
# kwargs['use_flash_attn'] = True  # Use flash_attn if desired

model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'}, **kwargs)
# Modify max_new_tokens
model.generation_config.max_new_tokens = 128

template = get_template(template_type, tokenizer)
seed_everything(42)
query = 'What is the provincial capital of Zhejiang?'
response, history = inference(model, template, query)
print(f'query: {query}')
print(f'response: {response}')

query = 'What delicious food can be found here?'
response, history = inference(model, template, query, history)
print(f'query: {query}')
print(f'response: {response}')
print(f'history: {history}')

"""Output[0]:
query: What is the provincial capital of Zhejiang?
response: The provincial capital of Zhejiang is Hangzhou.
query: What delicious food can be found here?
response: Hangzhou has many famous delicacies, such as West Lake Vinegar Fish, Longjing Shrimp, Sweet and Sour Spare Ribs, and Maodu. Additionally, there are unique Hangzhou-style pastries like Osmanthus Cake, Lotus Paste Pastry, and Aiwo Steamed Rice Cakes.
history: [('What is the provincial capital of Zhejiang?', 'The provincial capital of Zhejiang is Hangzhou.'), ('What delicious food can be found here?', 'Hangzhou has many famous delicacies, such as West Lake Vinegar Fish, Longjing Shrimp, Sweet and Sour Spare Ribs, and Maodu. Additionally, there are unique Hangzhou-style pastries like Osmanthus Cake, Lotus Paste Pastry, and Aiwo Steamed Rice Cakes.')]
"""

# Streaming dialogue output with verbose mode
inference(model, template, 'What was the first question?', history, verbose=True, stream=True)
"""Output[1]:
[PROMPT]
You asked your first question, "What is the provincial capital of Zhejiang?"
[OUTPUT] Your first question was "What is the provincial capital of Zhejiang?"
"""

For more on inference usage, please refer to [here](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM_Inference_Guide.md).
@@ -0,0 +1,56 @@
# Inference Qwen Using DashScope

The simplest way to use Qwen through APIs is the DashScope API service on Alibaba Cloud. We give an introduction to its usage below. Additionally, we provide a script for you to deploy an OpenAI-style API on your own servers.

DashScope is the large language model API service provided by Alibaba Cloud, which now supports Qwen. Note that the models behind DashScope are in-house versions temporarily without details provided. The services include `qwen-turbo` and `qwen-plus`, where the former runs faster and the latter achieves better performance. For more information, visit the documentation [here](https://dashscope.aliyun.com).

Please head to the official website [link](https://help.aliyun.com/zh/dashscope/developer-reference/activate-dashscope-and-create-an-api-key?spm=a2c4g.11186623.0.0.6c2774fahtfXdn) to create a DashScope account and obtain the API key (AK). We recommend setting the AK as an environment variable:
```bash
export DASHSCOPE_API_KEY="YOUR_DASHSCOPE_API_KEY"
```
Then install the packages; click [here](https://help.aliyun.com/zh/dashscope/developer-reference/install-dashscope-sdk) for the documentation. If you use Python, you can install DashScope with pip:
```bash
pip install dashscope
```
If you use the Java SDK, you can install it in this way:
```xml
<!-- https://mvnrepository.com/artifact/com.alibaba/dashscope-sdk-java -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>dashscope-sdk-java</artifactId>
    <version>the-latest-version</version>
</dependency>
```
The simplest way to use DashScope is with messages, which is similar to the OpenAI API. The example is demonstrated below:
```python
import random
from http import HTTPStatus
from dashscope import Generation


def call_with_messages():
    messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
                {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
    gen = Generation()
    response = gen.call(
        Generation.Models.qwen_turbo,
        messages=messages,
        seed=random.randint(1, 10000),  # set the random seed; optional, defaults to 1234 if not set
        result_format='message',  # set the result to be in "message" format
    )
    return response


if __name__ == '__main__':
    response = call_with_messages()
    if response.status_code == HTTPStatus.OK:
        print(response)
    else:
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))
```
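
DashScope can also stream partial results as they are generated. The following is a small sketch, assuming the installed `dashscope` SDK accepts a `stream=True` flag on `Generation.call` and yields incremental response objects; consult the official SDK documentation for the exact behavior of your version.

```python
from http import HTTPStatus
from dashscope import Generation

# Streaming sketch (assumption: stream=True yields partial response objects).
messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
responses = Generation().call(
    Generation.Models.qwen_turbo,
    messages=messages,
    result_format='message',
    stream=True,  # yield partial results instead of a single final response
)
for partial in responses:
    if partial.status_code == HTTPStatus.OK:
        print(partial)  # inspect the partial output as it accumulates
    else:
        print(f'Error {partial.code}: {partial.message}')
```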

For more usage examples, please visit the official website.

<br><br>
@@ -0,0 +1,113 @@
# Quantization

## GPTQ

We provide a solution based on [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) and release the Int4 and Int8 quantized models, which achieve nearly lossless model quality with lower memory cost and faster inference speed.

Here we demonstrate how to use our provided quantized models for inference. Before you start, make sure you meet the requirements of auto-gptq (e.g., torch 2.0 or above, transformers 4.32.0 or above, etc.) and install the required packages:

```bash
pip install auto-gptq optimum
```

If you run into problems installing `auto-gptq`, we advise you to check out the official [repo](https://github.com/PanQiWei/AutoGPTQ) to find a wheel.

> Note: The pre-compiled `auto-gptq` packages strongly depend on the version of `torch` and its CUDA version. Moreover, due to recent updates,
> you may also encounter unsupported version errors from `transformers`, `optimum`, or `peft`.
> We recommend using the latest versions meeting the following requirements:
> - torch==2.1 auto-gptq>=0.5.1 transformers>=4.35.0 optimum>=1.14.0 peft>=0.6.1
> - torch>=2.0,<2.1 auto-gptq<0.5.0 transformers<4.35.0 optimum<1.14.0 peft>=0.5.0,<0.6.0

Then you can load the quantized model easily and run inference just as usual:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model names: "Qwen/Qwen-7B-Chat-Int4", "Qwen/Qwen-14B-Chat-Int4"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat-Int4",
    device_map="auto",
    trust_remote_code=True
).eval()
response, history = model.chat(tokenizer, "Hi", history=None)
```

We illustrate the performance of the BF16, Int8, and Int4 models on the benchmarks, and we find that the quantized models do not suffer from significant performance degradation. Results are shown below:

| Quantization | MMLU | CEval (val) | GSM8K | Humaneval |
|----------------------|:----:|:-----------:|:-----:|:---------:|
| Qwen-1.8B-Chat (BF16)| 43.3 | 55.6 | 33.7 | 26.2 |
| Qwen-1.8B-Chat (Int8)| 43.1 | 55.8 | 33.0 | 27.4 |
| Qwen-1.8B-Chat (Int4)| 42.9 | 52.8 | 31.2 | 25.0 |
| Qwen-7B-Chat (BF16) | 55.8 | 59.7 | 50.3 | 37.2 |
| Qwen-7B-Chat (Int8) | 55.4 | 59.4 | 48.3 | 34.8 |
| Qwen-7B-Chat (Int4) | 55.1 | 59.2 | 49.7 | 29.9 |
| Qwen-14B-Chat (BF16) | 64.6 | 69.8 | 60.1 | 43.9 |
| Qwen-14B-Chat (Int8) | 63.6 | 68.6 | 60.0 | 48.2 |
| Qwen-14B-Chat (Int4) | 63.3 | 69.0 | 59.8 | 45.7 |
| Qwen-72B-Chat (BF16) | 74.4 | 80.1 | 76.4 | 64.6 |
| Qwen-72B-Chat (Int8) | 73.5 | 80.1 | 73.5 | 62.2 |
| Qwen-72B-Chat (Int4) | 73.4 | 80.1 | 75.3 | 61.6 |

## Quantization of the KV Cache

> NOTE: Please be aware that due to the internal mechanism of Hugging Face, the support files for this functionality
> (i.e., `cache_autogptq_cuda_256.cpp` and `cache_autogptq_cuda_kernel_256.cu`) may be missing. Please manually download
> them from the Hugging Face Hub and place them into the same folder as the other module files.

The attention KV cache can be quantized and compressed for storage to achieve a higher sample throughput. The arguments `use_cache_quantization` and `use_cache_kernel` in `config.json` are provided to enable KV cache quantization. The specific usage is as follows:
```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    device_map="auto",
    trust_remote_code=True,
    use_cache_quantization=True,
    use_cache_kernel=True,
    use_flash_attn=False
)
```
Attention: Currently, KV cache quantization and flash attention cannot be used at the same time.
If you enable both (`use_flash_attn=True`, `use_cache_quantization=True`, `use_cache_kernel=True`), `use_flash_attn` is disabled by default (`use_flash_attn=False`).

We have verified that using the quantized Int8 KV cache does not cause significant performance degradation in downstream evaluation. In the following, we focus on profiling its memory footprint in different conditions.
The profiling runs on a single A100-SXM4-80G GPU with PyTorch 2.0.1 and CUDA 11.4.
We use BF16 models to generate 1024 tokens by default, and "OOM" indicates an out-of-memory error.

With KV cache quantization, the model can infer with a larger batch size (bs).

| USE KV Cache | bs=1 | bs=4 | bs=16 | bs=32 | bs=64 | bs=100 |
|--------------|:------:|:------:|:------:|:------:|:------:|:------:|
| No | 16.3GB | 24.1GB | 31.7GB | 48.7GB | OOM | OOM |
| Yes | 15.5GB | 17.2GB | 22.3GB | 30.2GB | 48.2GB | 72.4GB |

With KV cache quantization, the model can also save more memory when generating longer sequences (`sl`, sequence length, referring to the number of tokens generated) at inference time.

| USE KV Cache | sl=512 | sl=1024 | sl=2048 | sl=4096 | sl=8192 |
|--------------|:------:|:-------:|:-------:|:-------:|:-------:|
| No | 15.2GB | 16.3GB | 17.6GB | 19.5GB | 23.2GB |
| Yes | 15GB | 15.5GB | 15.8GB | 16.6GB | 17.6GB |

With KV cache quantization, the model converts the format of `layer_past` from float to int8, and the quantized `layer_past` also stores the quantization parameters.

Specific steps are as follows:

1. Quantize key/value
```
qv, scale, zero_point = quantize_cache_v(v)
```
2. Store into layer_past

The following is the format of the quantized `layer_past`:
```
layer_past = ((q_key, key_scale, key_zero_point),
              (q_value, value_scale, value_zero_point))
```

The original format of `layer_past` is shown below:
```
layer_past = (key, value)
```

If you want to use the quantized attention KV, apply the dequantization operation to convert the Int8 key/value back to float format as follows:
```
v = dequantize_cache_torch(qv, scale, zero_point)
```
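
The `quantize_cache_v` / `dequantize_cache_torch` kernels above live in the model's remote code; conceptually they perform an asymmetric int8 round trip over the cached tensors. The standalone PyTorch sketch below (with function names chosen here for illustration, not the actual kernels) shows what such a quantize/dequantize pair does to a cached value tensor:

```python
import torch

def quantize_cache_demo(v: torch.Tensor, eps: float = 1e-6):
    """Asymmetric per-tensor int8 quantization: map [min, max] onto [0, 255]."""
    vmin, vmax = v.min(), v.max()
    scale = (vmax - vmin).clamp(min=eps) / 255.0
    zero_point = vmin
    qv = ((v - zero_point) / scale).round().clamp(0, 255).to(torch.uint8)
    return qv, scale, zero_point

def dequantize_cache_demo(qv: torch.Tensor, scale, zero_point) -> torch.Tensor:
    """Recover an approximate float tensor from the quantized cache."""
    return qv.to(torch.float32) * scale + zero_point

# Round-trip check on a fake value-cache slice: [batch, heads, seq, head_dim]
v = torch.randn(1, 32, 128, 128)
qv, scale, zero_point = quantize_cache_demo(v)
v_restored = dequantize_cache_demo(qv, scale, zero_point)
print("max abs error:", (v - v_restored).abs().max().item())  # on the order of scale/2
```

The stored result corresponds to the `(q_value, value_scale, value_zero_point)` tuple layout of `layer_past` shown above.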

<br>
@@ -0,0 +1,46 @@
# Inference Qwen Using TensorRT-LLM
Below, we provide a simple example to show how to run inference with Qwen using TensorRT-LLM. We recommend GPUs with compute capability of at least SM_80, such as A10 and A800, to run this example, as we have tested it on these GPUs. You can look up your GPU's compute capability at this [link](https://developer.nvidia.com/cuda-gpus).

## Installation
You can use the pre-built Docker image to run this example. You can also refer to the official [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository for installation and detailed usage.
```bash
docker run --gpus all -it --ipc=host --network=host pai-image-manage-registry.cn-wulanchabu.cr.aliyuncs.com/pai/llm-inference:tensorrt-llm-0.8.0 bash
```
## Quickstart
1. Download the model with ModelScope

```bash
cd TensorRT-LLM/examples/qwen
python3 -c "from modelscope.hub.snapshot_download import snapshot_download; snapshot_download('Qwen/Qwen-1_8B-Chat', cache_dir='.', revision='master')"
mkdir -p ./tmp/Qwen
mv Qwen/Qwen-1_8B-Chat ./tmp/Qwen/1_8B
```

2. Build a TensorRT engine from the HF checkpoint

```bash
python3 build.py --hf_model_dir ./tmp/Qwen/1_8B/ \
                 --dtype float16 \
                 --remove_input_padding \
                 --use_gpt_attention_plugin float16 \
                 --enable_context_fmha \
                 --use_gemm_plugin float16 \
                 --output_dir ./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu/
```

3. Inference
```bash
python3 ../run.py --input_text "你好，请问你叫什么？" \
                  --max_output_len=512 \
                  --tokenizer_dir ./tmp/Qwen/1_8B/ \
                  --engine_dir=./tmp/Qwen/1_8B/trt_engines/fp16/1-gpu
```
```
Input [Text 0]: "<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
你好，请问你叫什么？<|im_end|>
<|im_start|>assistant
"
Output [Text 0 Beam 0]: "你好，我是来自阿里云的大规模语言模型，我叫通义千问。"
```
@@ -0,0 +1,184 @@
# Inference Qwen Using vLLM

For deployment and fast inference, we suggest using vLLM.

## Installation

If you use CUDA 12.1 and PyTorch 2.1, you can directly use the following command to install vLLM.
```bash
# Install vLLM with CUDA 12.1.
pip install vllm
```
Otherwise, please refer to the official vLLM [Installation Instructions](https://docs.vllm.ai/en/latest/getting_started/installation.html).

If you have trouble building vLLM, we recommend using the Docker image.

```bash
docker run --gpus all -it --rm --ipc=host --network=host qwenllm/qwen:cu121 bash
```

## GPU Requirements

Qwen models use bfloat16 by default, but bfloat16 is only supported on GPUs with compute capability of at least 8.0. For GPUs with compute capability below 8.0, it is recommended to set the dtype to float16. You can look up your GPU's compute capability at this [link](https://developer.nvidia.com/cuda-gpus).
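
You can also check the compute capability programmatically and pick the dtype accordingly; a minimal sketch using the same PyTorch call as the unit tests in this repository:

```python
import torch

# Pick bfloat16 on compute capability 8.0 or newer, otherwise fall back to float16.
major, minor = torch.cuda.get_device_capability()
dtype = "bfloat16" if major >= 8 else "float16"
print(f"compute capability: {major}.{minor} -> use --dtype {dtype}")
```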

We have tested the GPU memory usage on an NVIDIA Tesla V100 32GB by manually adjusting gpu-memory-utilization in eager mode. You can refer to the following table to determine whether your machine is capable of running these models.
| Model | seq_len 2048 | seq_len 8192 | seq_len 16384 | seq_len 32768 |
| :--- | ---: | ---: | ---: | ---: |
| Qwen-1.8B | 6.22G | 7.46G | | |
| Qwen-7B | 17.94G | 20.96G | | |
| Qwen-7B-Int4 | 9.10G | 12.26G | | |
| Qwen-14B | 33.40G | | | |
| Qwen-14B-Int4 | 13.30G | | | |
| Qwen-72B | 166.87G | 185.50G | 210.80G | 253.80G |
| Qwen-72B-Int4 | 55.37G | 73.66G | 97.79G | 158.80G |

We have also listed the models that can run on consumer graphics cards at the default sequence length in the following table. If the GPU memory only exceeds the model's memory usage by a small margin, you can make the model run on your machine by reducing the max-model-len parameter.<br>
(Note: to run Qwen-14B-Int4 on an NVIDIA RTX 3080 Ti, you need to set gpu-memory-utilization to 0.99 and enforce eager mode.)

| GPU Memory | GPU | Supported Models |
| :---: | :---: | :---: |
| 24GB | NVIDIA RTX 4090/3090/A5000 | Qwen-1.8B/Qwen-7B/Qwen-7B-Int4/Qwen-14B-Int4 |
| 16GB | NVIDIA RTX A4000 | Qwen-1.8B/Qwen-7B-Int4/Qwen-14B-Int4 |
| 12GB | NVIDIA RTX 3080Ti/TITAN Xp | Qwen-1.8B/Qwen-14B-Int4 |
| 11GB | NVIDIA RTX 2080Ti/GTX 1080Ti | Qwen-1.8B |
| 10GB | NVIDIA RTX 3080 | Qwen-1.8B |

## Usage

### vLLM + Web Demo / OpenAI-like API

You can use FastChat to launch a web demo or an OpenAI API server. First, install FastChat:

```bash
pip install "fschat[model_worker,webui]==0.2.33" "openai<1.0"
```

To run Qwen with vLLM and FastChat, you first need to launch a controller:
```bash
python -m fastchat.serve.controller
```

Then you can launch the model worker, which loads your model for inference. For single-GPU inference, you can directly run:
```bash
python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype bfloat16
# for Int4 models or GPUs with compute capability less than 8.0
# python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --dtype float16
```

However, if you want to run the model on multiple GPUs for faster inference or larger memory, you can use tensor parallelism supported by vLLM. Suppose you run the model on 4 GPUs; the command is shown below:
```bash
python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype bfloat16
# for Int4 models or GPUs with compute capability less than 8.0
# python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code --tensor-parallel-size 4 --dtype float16
```

After launching your model worker, you can launch:

* Web UI Demo
```bash
python -m fastchat.serve.gradio_web_server
```

* OpenAI API
```bash
python -m fastchat.serve.openai_api_server --host localhost --port 8000
```

For the OpenAI API server, you can invoke it in the following manner.

```python
import openai
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

# create a request activating streaming response
for chunk in openai.ChatCompletion.create(
    model="Qwen",
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=True
    # Specifying stop words in streaming output format is not yet supported and is under development.
):
    if hasattr(chunk.choices[0].delta, "content"):
        print(chunk.choices[0].delta.content, end="", flush=True)

# create a request not activating streaming response
response = openai.ChatCompletion.create(
    model="Qwen",
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=False,
    stop=[]  # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
)
print(response.choices[0].message.content)
```

If you find `"POST /v1/chat/completions HTTP/1.1" 200 OK` in the openai_api_server log, it indicates that the call was successful.

Note that vLLM does not support dynamic-NTK RoPE. Therefore, extending Qwen models to long sequences may lead to quality degradation (even gibberish).

### vLLM + Transformer-like Wrapper

You can download the [wrapper codes](vllm_wrapper.py) and execute the following commands for multiple rounds of dialogue interaction. (Note: it currently only supports the ``model.chat()`` method.)

```python
from vllm_wrapper import vLLMWrapper

# bfloat16 is only supported on GPUs with compute capability of at least 8.0
model = vLLMWrapper('Qwen/Qwen-7B-Chat', tensor_parallel_size=1)

# for Int4 models or GPUs with compute capability less than 8.0
# model = vLLMWrapper('Qwen/Qwen-7B-Chat-Int4', tensor_parallel_size=1, dtype="float16")

response, history = model.chat(query="你好", history=None)
print(response)
response, history = model.chat(query="给我讲一个年轻人奋斗创业最终取得成功的故事。", history=history)
print(response)
response, history = model.chat(query="给这个故事起一个标题", history=history)
print(response)
```
### vLLM Standalone OpenAI-like API

You can also deploy an OpenAI API server independently through vLLM. First, you need to download the [chat template file](template_chatml.jinja).

Then, you can launch an OpenAI API server with the following command:

```bash
python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --chat-template template_chatml.jinja

# for Int4 models or GPUs with compute capability less than 8.0
# python -m vllm.entrypoints.openai.api_server --model $model_path --trust-remote-code --dtype float16 --chat-template template_chatml.jinja
```

For the vLLM standalone OpenAI API server, you need to set the `stop_token_ids` parameter to `[151645]` or the `stop` parameter to `["<|im_end|>"]` when invoking the server.

```python
import openai
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

# create a request activating streaming response
for chunk in openai.ChatCompletion.create(
    model="Qwen",
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=True,
    stop_token_ids=[151645]
):
    if hasattr(chunk.choices[0].delta, "content"):
        print(chunk.choices[0].delta.content, end="", flush=True)

# create a request not activating streaming response
response = openai.ChatCompletion.create(
    model="Qwen",
    messages=[
        {"role": "user", "content": "你好"}
    ],
    stream=False,
    stop_token_ids=[151645]
)
print(response.choices[0].message.content)
```
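
The same server can also be called with a plain HTTP request instead of the legacy `openai<1.0` SDK. The following is a minimal sketch that simply posts to the standard OpenAI-compatible `/v1/chat/completions` route, assuming the server launched above is listening on localhost:8000:

```python
import json

import requests

payload = {
    "model": "Qwen",
    "messages": [{"role": "user", "content": "你好"}],
    "stream": False,
    "stop": ["<|im_end|>"],  # stop at the ChatML end-of-turn marker
}
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```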
@@ -0,0 +1,6 @@
{% for message in messages %}
{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}
{{'<|im_start|>' + message['role'] + '\n' + message['content']}}
{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
@@ -0,0 +1,13 @@
# Unit testing
- Run all unit tests
```bash
cd tests && pytest -s
```
- Run the unit tests under a single folder
```bash
cd tests && pytest -s {dir}
```
- Rerun the test cases that failed in the last run
```bash
cd tests && pytest -s --lf
```
@@ -0,0 +1 @@
[{"conversations": [{"from": "user", "value": "你好"}, {"from": "assistant", "value": "你好！很高兴为你提供帮助。"}], "id": "identity_0"}, {"conversations": [{"from": "user", "value": "给我讲一个年轻人奋斗创业最终取得成功的故事。"}, {"from": "assistant", "value": "这是一个关于一个年轻人奋斗创业最终取得成功的故事。"}], "id": "identity_1"}]
@@ -0,0 +1,100 @@
import os
import sys
import shutil
from itertools import product

import pytest
import torch
from modelscope.hub.snapshot_download import snapshot_download

sys.path.append(os.path.dirname(__file__) + "/..")
from utils import run_in_subprocess
from ut_config import (
    MODEL_TYPE,
    DOCKER_VERSION_CU114,
    DOCKER_VERSION_CU117,
    DOCKER_VERSION_CU121,
    DOCKER_MOUNT_DIR,
    DOCKER_TEST_DIR,
    DATA_DIR,
    DS_CONFIG_ZERO2_DIR,
    DS_CONFIG_ZERO3_DIR,
)

is_chat = ["chat", "base"]
docker_version = [DOCKER_VERSION_CU114, DOCKER_VERSION_CU117, DOCKER_VERSION_CU121]
# ZeRO3 is incompatible with LoRA when fine-tuning the base model.
# FSDP and ZeRO3 are incompatible with QLoRA.
parametrize_list_none_ds = list(
    product(*[[1], ["full", "lora"], is_chat, docker_version, [None]])
)
parametrize_list_ds_zero2 = list(
    product(*[[2], ["full", "lora"], is_chat, docker_version, [DS_CONFIG_ZERO2_DIR]])
)
parametrize_list_ds_zero3 = list(
    product(*[[2], ["full"], is_chat, docker_version, [DS_CONFIG_ZERO3_DIR]])
) + list(product(*[[2], ["lora"], ["chat"], docker_version, [DS_CONFIG_ZERO3_DIR]]))
parametrize_list_qlora = list(
    product(*[[1, 2], ["qlora"], ["chat"], docker_version, [None, DS_CONFIG_ZERO2_DIR]])
)
parametrize_list = (
    parametrize_list_none_ds
    + parametrize_list_ds_zero2
    + parametrize_list_ds_zero3
    + parametrize_list_qlora
)


@pytest.mark.parametrize(
    "num_gpus,train_type,is_chat,docker_version,deepspeed", parametrize_list
)
def test_finetune(num_gpus, train_type, is_chat, docker_version, deepspeed):
    cmd_docker = f"docker run --gpus all --ipc=host --network=host --rm -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c "
    cmd = ""
    # for GPUs with SM < 80
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere:
        cmd = "pip uninstall -y flash-attn && "

    model_type = f"{MODEL_TYPE}-Chat" if is_chat == "chat" else MODEL_TYPE
    model_type = f"{model_type}-Int4" if train_type == "qlora" else model_type
    cmd += f"""torchrun --nproc_per_node {num_gpus} --nnodes 1 --node_rank 0 --master_addr localhost --master_port 12345 {DOCKER_MOUNT_DIR}/finetune.py \
    --model_name_or_path "{DOCKER_TEST_DIR}/{model_type}/" \
    --data_path {DATA_DIR} \
    --output_dir "{DOCKER_TEST_DIR}/output_qwen" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1000 \
    --save_total_limit 10 \
    --learning_rate 1e-5 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "none" \
    --model_max_length 512"""
    if deepspeed:
        cmd += f" --deepspeed {deepspeed}"
    if train_type == "lora":
        cmd += " --use_lora"
    elif train_type == "qlora":
        cmd += " --use_lora --q_lora"
    # for SM < 80
    if (
        (not is_ampere)
        and train_type == "lora"
        and (deepspeed and "zero2" in deepspeed)
        and is_chat == "base"
    ):
        cmd += " --fp16 True"
    snapshot_download(model_type, cache_dir=".", revision="master")
    run_in_subprocess(cmd_docker + f'"{cmd}"')
    if train_type == "full":
        assert os.path.exists("output_qwen/config.json")
    else:
        assert os.path.exists("output_qwen/adapter_config.json")
    shutil.rmtree("output_qwen")
@@ -0,0 +1,78 @@
import os
import sys
import time
import subprocess

import pytest
import torch
from modelscope.hub.snapshot_download import snapshot_download

sys.path.append(os.path.dirname(__file__) + "/..")
from utils import run_in_subprocess, simple_openai_api, TelnetPort
from ut_config import (
    MODEL_TYPE,
    DOCKER_VERSION_CU114,
    DOCKER_VERSION_CU117,
    DOCKER_VERSION_CU121,
    DOCKER_MOUNT_DIR,
    DOCKER_TEST_DIR,
)


# use_cpu=True, use_int4=False: RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
# use_cpu=True, use_int4=True: ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU. You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object
@pytest.mark.parametrize(
    "docker_version,use_cpu,use_int4",
    [
        (DOCKER_VERSION_CU114, False, False),
        (DOCKER_VERSION_CU114, False, True),
        (DOCKER_VERSION_CU117, False, False),
        (DOCKER_VERSION_CU117, False, True),
        (DOCKER_VERSION_CU121, False, False),
        (DOCKER_VERSION_CU121, False, True),
    ],
)
def test_inference_api(docker_version, use_cpu, use_int4):
    container_name = "test_inference_api"
    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {docker_version} /bin/bash -c '
    cmd = ""
    # for GPUs with SM < 80
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere:
        cmd += "pip uninstall -y flash-attn && "

    cmd += f"""python {DOCKER_MOUNT_DIR}/openai_api.py -c {DOCKER_TEST_DIR}/{model_type}"""

    if use_cpu:
        cmd += " --cpu-only"

    snapshot_download(model_type, cache_dir=".", revision="master")
    # start the model server
    print(cmd_docker + f'"{cmd}"')
    run_in_subprocess(
        f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
    )
    run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")

    while not TelnetPort("localhost", 8000):
        print("Wait for the model service to start.")
        time.sleep(0.5)

        if (
            subprocess.run(
                f"docker inspect {container_name}",
                shell=True,
                stdout=subprocess.DEVNULL,
            ).returncode
            != 0
        ):
            break
    try:
        # when loading an Int4 model such as Qwen-1_8B-Chat-Int4, the served model name is Qwen-1_8B-Chat
        simple_openai_api(f"{MODEL_TYPE}-Chat".split("/")[-1])
    except Exception as e:
        time.sleep(1)
        with open("tmp.log") as f:
            raise Exception(f"{e} \n {f.read()}")

    run_in_subprocess(f"docker rm -f {container_name}")
@@ -0,0 +1,73 @@
import os
import sys
import time
import subprocess

import pytest
import torch
from modelscope.hub.snapshot_download import snapshot_download

sys.path.append(os.path.dirname(__file__) + "/..")
from utils import run_in_subprocess, simple_openai_api, TelnetPort
from ut_config import (
    MODEL_TYPE,
    DOCKER_VERSION_CU121,
    DOCKER_MOUNT_DIR,
    DOCKER_TEST_DIR,
)


@pytest.mark.parametrize(
    "num_gpus,use_int4",
    [
        (1, False),
        (1, True),
        (2, False),
        # ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.
        # (2, True)
    ],
)
def test_inference_vllm_fschat(num_gpus, use_int4):
    model_type = f"{MODEL_TYPE}-Chat-Int4" if use_int4 else f"{MODEL_TYPE}-Chat"
    container_name = "test_inference_vllm_fschat"
    cmd_docker = f'docker run --gpus all --ipc=host --network=host --rm --name="{container_name}" -p 8000:8000 -v {os.getcwd()}/../../../Qwen:{DOCKER_MOUNT_DIR} {DOCKER_VERSION_CU121} /bin/bash -c '
    cmd = ""

    cmd += f"""nohup python -m fastchat.serve.controller > /dev/null 2>&1 \
    & python -m fastchat.serve.openai_api_server --host localhost --port 8000 > /dev/null 2>&1 \
    & python -m fastchat.serve.vllm_worker --model-path {DOCKER_TEST_DIR}/{model_type} --tensor-parallel-size {num_gpus} --trust-remote-code"""

    # for GPUs with SM < 80, or when use_int4 == True
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
    if not is_ampere or use_int4:
        cmd += " --dtype half"

    snapshot_download(model_type, cache_dir=".", revision="master")
    # start the model server
    run_in_subprocess(
        f'docker rm -f {container_name} 2>/dev/null || echo "The container does not exist."'
    )
    print(cmd_docker + f'"{cmd}"')
    run_in_subprocess("nohup " + cmd_docker + f'"{cmd}"' + " > tmp.log 2>&1 &")

    while not TelnetPort("localhost", 21002):
        print("Wait for the model service to start.")
        time.sleep(0.5)

        if (
            subprocess.run(
                f"docker inspect {container_name}",
                shell=True,
                stdout=subprocess.DEVNULL,
            ).returncode
            != 0
        ):
            break

    try:
        simple_openai_api(model_type.split("/")[-1])
    except Exception as e:
        time.sleep(1)
        with open("tmp.log") as f:
            raise Exception(f"{e} \n {f.read()}")

    run_in_subprocess(f"docker rm -f {container_name}")
@@ -0,0 +1,18 @@
import os

# common
MODEL_TYPE = "Qwen/Qwen-1_8B"
DOCKER_VERSION_CU114 = "qwenllm/qwen:cu114"
DOCKER_VERSION_CU117 = "qwenllm/qwen:cu117"
DOCKER_VERSION_CU121 = "qwenllm/qwen:cu121"
DOCKER_MOUNT_DIR = "/qwen-recipes"
DOCKER_TEST_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests")

# finetune
DATA_DIR = os.path.join(DOCKER_MOUNT_DIR, "recipes/tests/assets/test_sampled_qwen.json")
DS_CONFIG_ZERO2_DIR = os.path.join(
    DOCKER_MOUNT_DIR, "finetune/ds_config_zero2.json"
)
DS_CONFIG_ZERO3_DIR = os.path.join(
    DOCKER_MOUNT_DIR, "finetune/ds_config_zero3.json"
)
@@ -0,0 +1,61 @@
import logging
import socket
import subprocess

import openai


def run_in_subprocess(cmd):
    try:
        with subprocess.Popen(
            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        ) as return_info:
            while True:
                next_line = return_info.stdout.readline()
                return_line = next_line.decode("utf-8", "ignore").strip()
                if return_line == "" and return_info.poll() is not None:
                    break
                if return_line != "":
                    logging.info(return_line)

            err_lines = ""
            while True:
                next_line = return_info.stderr.readline()
                return_line = next_line.decode("utf-8", "ignore").strip()
                if return_line == "" and return_info.poll() is not None:
                    break
                if return_line != "":
                    logging.info(return_line)
                    err_lines += return_line + "\n"

            return_code = return_info.wait()
            if return_code:
                raise RuntimeError(err_lines)
    except Exception as e:
        raise e


def simple_openai_api(model):
    openai.api_base = "http://localhost:8000/v1"
    openai.api_key = "none"

    # create a request without streaming response
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": "你好"}],
        stream=False,
        stop=[],  # You can add custom stop words here, e.g., stop=["Observation:"] for ReAct prompting.
    )
    print(response.choices[0].message.content)


def TelnetPort(server_ip, port):
    sk = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sk.settimeout(1)
    connect_flag = False
    try:
        sk.connect((server_ip, port))
        connect_flag = True
    except Exception:
        connect_flag = False
    sk.close()
    return connect_flag