#!/bin/bash
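# Restrict CUDA to a single hardware work queue; distributed training scripts
# commonly set this so communication kernels are scheduled in launch order
# (keep it unless you know your setup does not need it).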
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=$(pwd)
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
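# The values above assume a single-node run. For multi-node training (a sketch;
# adapt to your cluster), launch this script on every node with NNODES set to
# the total node count, NODE_RANK set per node (0..NNODES-1), and MASTER_ADDR
# pointing at the rank-0 node's reachable address.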
MODEL="Qwen/Qwen-7B" # Set this to a local checkpoint path if you do not want to load the model from Hugging Face directly.
# ATTENTION: specify the path to your training data, which should be a JSON file
# containing a list of conversations. See the finetuning section of the README
# for more information.
DATA="path_to_data"
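# A minimal sketch of the expected layout (following the format shown in the
# Qwen README; exact field names may differ across finetune.py versions):
# [
#   {
#     "id": "identity_0",
#     "conversations": [
#       {"from": "user", "value": "Hello."},
#       {"from": "assistant", "value": "Hi! How can I help you today?"}
#     ]
#   }
# ]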
DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
    --model_name_or_path "$MODEL" \
    --data_path "$DATA" \
    --bf16 True \
    --output_dir output_qwen \
    --num_train_epochs 5 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 1000 \
    --save_total_limit 10 \
    --learning_rate 3e-4 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "none" \
    --model_max_length 512 \
    --lazy_preprocess True \
    --use_lora \
    --gradient_checkpointing \
    --deepspeed finetune/ds_config_zero2.json
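
# Because --use_lora is set, checkpoints under output_qwen hold LoRA adapter
# weights rather than full model weights; see the Qwen README for loading or
# merging them (it documents a PEFT-based workflow).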