From 14a6e8b54cd7fce687f67606efd0eb40a9916c8c Mon Sep 17 00:00:00 2001 From: sunyi001 <1659275352@qq.com> Date: Tue, 20 May 2025 17:05:40 +0800 Subject: [PATCH] add training script for qwen2_5_vl_32b --- .../built-in/rl/VeRL_for_PyTorch/README.md | 51 ++++-- .../train_qwen2_5_vl_32b_GRPO_full_32p.sh | 154 ++++++++++++++++++ ...ain_qwen2_5_vl_32b_GRPO_performance_32p.sh | 154 ++++++++++++++++++ 3 files changed, 342 insertions(+), 17 deletions(-) create mode 100644 PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_full_32p.sh create mode 100644 PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh diff --git a/PyTorch/built-in/rl/VeRL_for_PyTorch/README.md b/PyTorch/built-in/rl/VeRL_for_PyTorch/README.md index fc58b5cdf1..0ed8d871a4 100644 --- a/PyTorch/built-in/rl/VeRL_for_PyTorch/README.md +++ b/PyTorch/built-in/rl/VeRL_for_PyTorch/README.md @@ -146,7 +146,7 @@ verl‌是一个集SFT(监督学习)与RL(强化学习)于一体的灵 ## 获取预训练模型 - 用户自行下载`Qwen2.5-VL-7B-Instruct`、`Qwen2.5-VL-3B-Instruct`、`Qwen2.5-7B-Instruct`和`Qwen2.5-32B-Instruct`模型。 + 用户自行下载`Qwen2.5-VL-7B-Instruct`、`Qwen2.5-VL-3B-Instruct`、`Qwen2.5-VL-32B-Instruct`、`Qwen2.5-7B-Instruct`和`Qwen2.5-32B-Instruct`模型。 # 开始训练 @@ -164,28 +164,17 @@ verl‌是一个集SFT(监督学习)与RL(强化学习)于一体的灵 1. 主从节点保证模型和数据集路径完全相同。 - 2. 主从节点分别执行以下命令获取节点ip对应的网口名称: - ```shell - ifconfig - ``` - - 3. 主从节点分别设置以下环境变量: - ```shell - export GLOO_SOCKET_IFNAME=网口名称 - export NCCL_SOCKET_IFNAME=网口名称 - ``` - - 4. 主节点执行以下命令启动ray集群: + 2. 主节点执行以下命令启动ray集群: ```shell ray start --head ``` - 5. 从节点执行以下命令加入ray集群: + 3. 从节点执行以下命令加入ray集群: ```shell ray start --address='主节点ip:6379' ``` - 6. 从节点执行以下命令确认双机已互联: + 4. 
从节点执行以下命令确认双机已互联:
      ```shell
      ray status
      ```
@@ -220,6 +209,22 @@ verl‌是一个集SFT(监督学习)与RL(强化学习)于一体的灵
      bash test/train_qwen2_5_vl_7b_GRPO_performance_16p.sh --data_path=xxx --model_path=xxx # 16卡性能
      ```
 
+   `Qwen2.5-VL-32B-Instruct`模型支持双机32卡训练。
+
+   - 双机32卡训练
+
+     ```shell
+     # 主节点执行
+     bash test/train_qwen2_5_vl_32b_GRPO_full_32p.sh --data_path=xxx --model_path=xxx # 32卡训练
+     ```
+
+   - 双机32卡性能
+
+     ```shell
+     # 主节点执行
+     bash test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh --data_path=xxx --model_path=xxx # 32卡性能
+     ```
+
    `Qwen2.5-7B-Instruct`模型支持单机16卡训练。
 
    - 单机16卡训练
@@ -279,5 +284,17 @@ verl‌是一个集SFT(监督学习)与RL(强化学习)于一体的灵
 
 ## FAQ
 
-无。
-
+- 如果在训练过程中遇到`RuntimeError: Gloo connectFullMesh failed`错误,请按照以下步骤操作:
+
+  - 在主从节点分别执行以下命令获取节点ip对应的网口名称:
+
+    ```shell
+    ifconfig
+    ```
+
+  - 在主从节点分别设置以下环境变量:
+
+    ```shell
+    export GLOO_SOCKET_IFNAME=网口名称
+    export HCCL_SOCKET_IFNAME=网口名称
+    ```
diff --git a/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_full_32p.sh b/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_full_32p.sh
new file mode 100644
index 0000000000..d9df6cd064
--- /dev/null
+++ b/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_full_32p.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+
+############### Resolve the directory the training script runs from ###############
+# Run from the directory that holds test/ so relative paths work; test_path_dir is the path of the test folder, and a failed cd aborts the script
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [[ "${cur_path_last_dirname}" == "test" ]]; then
+    test_path_dir=${cur_path}
+    cd .. || exit 1
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+# Dataset and model paths; leave empty here, they are filled from --data_path / --model_path below
+data_path=""
+model_path=""
+
+# Basic parameters; review when adapting the script to another model
+# Network name, same as the directory name
+Network="Qwen2_5_vl_32b_for_PyTorch"
+
+# Help message (kept exiting with status 1, as before)
+if [[ $1 == --help || $1 == -h ]]; then
+    echo "usage:./test/train_qwen2_5_vl_32b_GRPO_full_32p.sh "
+    echo " "
+    echo "parameter explain:
+    --data_path source data of training
+    --model_path model path for GRPO
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Parse --data_path=<dir> and --model_path=<dir>; "$@" keeps every argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --model_path* ]]; then
+        model_path=${para#*=}
+    fi
+done
+
+# Both data_path and model_path are mandatory
+if [[ -z "$data_path" ]]; then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+if [[ -z "$model_path" ]]; then
+    echo "[Error] para \"model_path\" must be confing"
+    exit 1
+fi
+
+# Source the NPU environment unless etp_running_flag=true is present in the environment
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=${check_etp_flag#*=}
+if [[ "${etp_flag}" != "true" ]]; then
+    source "${test_path_dir}/env_npu.sh"
+fi
+
+# Training start time in epoch seconds
+start_time=$(date +%s)
+
+# Enter the training directory; abort if it cannot be entered
+cd "$cur_path" || exit 1
+
+if [[ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ]]; then
+    rm -rf "${test_path_dir:?}/output"
+    mkdir -p "${test_path_dir}/output"
+else
+    mkdir -p "${test_path_dir}/output"
+fi
+
+ENGINE=vllm
+
+nohup python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=grpo \
+    data.train_files="$data_path/train.parquet" \
+    data.val_files="$data_path/test.parquet" \
+    data.train_batch_size=512 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=2048 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    data.image_key=images \
+    actor_rollout_ref.model.path="$model_path" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.kl_loss_coef=0.01 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.use_torch_compile=False \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
+    actor_rollout_ref.rollout.name="$ENGINE" \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+    actor_rollout_ref.rollout.enable_chunked_prefill=False \
+    actor_rollout_ref.rollout.enforce_eager=False \
+    actor_rollout_ref.rollout.free_cache_engine=False \
+    actor_rollout_ref.rollout.n=5 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='[console]' \
+    trainer.project_name='verl_grpo_example_geo3k' \
+    trainer.experiment_name='qwen2_5_vl_32b_function_rm' \
+    trainer.n_gpus_per_node=16 \
+    trainer.nnodes=2 \
+    trainer.save_freq=-1 \
+    trainer.test_freq=-1 \
+    trainer.total_epochs=15 > "${test_path_dir}/output/train_verl_qwen2_5_vl_32b.log" 2>&1 &
+wait $! || echo "[Error] training process exited with non-zero status" >&2
+
+# Training end time and end-to-end duration in seconds
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+# Print results
+echo "------------------ Final result ------------------"
+# Throughput: mean of the first 4 'perf/throughput:' values in the training log; stays empty when no such line exists
+FPS=$(grep 'perf/throughput:' "$test_path_dir/output/train_verl_qwen2_5_vl_32b.log" | awk -F 'perf/throughput:' '{print$2}' | awk -F ' ' '{print$1}' | head -n 4 | awk '{sum+=$1} END {if (NR > 0) print"",sum/NR}')
+
+# Drop overflow sentinel values produced by functional failures
+if [[ "${FPS}" == "2147483647" || "${FPS}" == "-2147483647" ]]; then
+    FPS=""
+fi
+# Print throughput
+echo "Final Performance images/sec : $FPS"
+
+# Print end-to-end duration
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance summary
+# Test case information
+DeviceType=$(uname -m)
+CaseName="${Network}_32p_full"
+
+## Performance data
+# Throughput
+ActualFPS=${FPS}
+
+# Write the key information to ${CaseName}.log (first line truncates the file, the rest append)
+echo "Network = ${Network}" > "$test_path_dir/output/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "$test_path_dir/output/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "$test_path_dir/output/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "$test_path_dir/output/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "$test_path_dir/output/${CaseName}.log"
diff --git a/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh b/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh
new file mode 100644
index 0000000000..0ec448e5d1
--- /dev/null
+++ b/PyTorch/built-in/rl/VeRL_for_PyTorch/test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+
+############### Resolve the directory the training script runs from ###############
+# Run from the directory that holds test/ so relative paths work; test_path_dir is the path of the test folder, and a failed cd aborts the script
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [[ "${cur_path_last_dirname}" == "test" ]]; then
+    test_path_dir=${cur_path}
+    cd .. || exit 1
+    cur_path=$(pwd)
+else
+    test_path_dir=${cur_path}/test
+fi
+
+# Dataset and model paths; leave empty here, they are filled from --data_path / --model_path below
+data_path=""
+model_path=""
+
+# Basic parameters; review when adapting the script to another model
+# Network name, same as the directory name
+Network="Qwen2_5_vl_32b_for_PyTorch"
+
+# Help message (kept exiting with status 1, as before)
+if [[ $1 == --help || $1 == -h ]]; then
+    echo "usage:./test/train_qwen2_5_vl_32b_GRPO_performance_32p.sh "
+    echo " "
+    echo "parameter explain:
+    --data_path source data of training
+    --model_path model path for GRPO
+    -h/--help show help message
+    "
+    exit 1
+fi
+
+# Parse --data_path=<dir> and --model_path=<dir>; "$@" keeps every argument intact (no word splitting or globbing)
+for para in "$@"
+do
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --model_path* ]]; then
+        model_path=${para#*=}
+    fi
+done
+
+# Both data_path and model_path are mandatory
+if [[ -z "$data_path" ]]; then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+if [[ -z "$model_path" ]]; then
+    echo "[Error] para \"model_path\" must be confing"
+    exit 1
+fi
+
+# Source the NPU environment unless etp_running_flag=true is present in the environment
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=${check_etp_flag#*=}
+if [[ "${etp_flag}" != "true" ]]; then
+    source "${test_path_dir}/env_npu.sh"
+fi
+
+# Training start time in epoch seconds
+start_time=$(date +%s)
+
+# Enter the training directory; abort if it cannot be entered
+cd "$cur_path" || exit 1
+
+if [[ -d "${test_path_dir}/output/${ASCEND_DEVICE_ID}" ]]; then
+    rm -rf "${test_path_dir:?}/output"
+    mkdir -p "${test_path_dir}/output"
+else
+    mkdir -p "${test_path_dir}/output"
+fi
+
+ENGINE=vllm
+
+nohup python3 -m verl.trainer.main_ppo \
+    algorithm.adv_estimator=grpo \
+    data.train_files="$data_path/train.parquet" \
+    data.val_files="$data_path/test.parquet" \
+    data.train_batch_size=512 \
+    data.max_prompt_length=1024 \
+    data.max_response_length=2048 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    data.image_key=images \
+    actor_rollout_ref.model.path="$model_path" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.model.use_remove_padding=True \
+    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.kl_loss_coef=0.01 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.entropy_coeff=0 \
+    actor_rollout_ref.actor.use_torch_compile=False \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.actor.fsdp_config.param_offload=False \
+    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
+    actor_rollout_ref.rollout.name="$ENGINE" \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+    actor_rollout_ref.rollout.enable_chunked_prefill=False \
+    actor_rollout_ref.rollout.enforce_eager=False \
+    actor_rollout_ref.rollout.free_cache_engine=False \
+    actor_rollout_ref.rollout.n=5 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+    actor_rollout_ref.ref.fsdp_config.param_offload=True \
+    algorithm.use_kl_in_reward=False \
+    trainer.critic_warmup=0 \
+    trainer.logger='[console]' \
+    trainer.project_name='verl_grpo_example_geo3k' \
+    trainer.experiment_name='qwen2_5_vl_32b_function_rm' \
+    trainer.n_gpus_per_node=16 \
+    trainer.nnodes=2 \
+    trainer.save_freq=-1 \
+    trainer.test_freq=-1 \
+    trainer.total_epochs=1 > "${test_path_dir}/output/train_verl_qwen2_5_vl_32b.log" 2>&1 &
+wait $! || echo "[Error] training process exited with non-zero status" >&2
+
+# Training end time and end-to-end duration in seconds
+end_time=$(date +%s)
+e2e_time=$(( end_time - start_time ))
+# Print results
+echo "------------------ Final result ------------------"
+# Throughput: mean of the first 4 'perf/throughput:' values in the training log; stays empty when no such line exists
+FPS=$(grep 'perf/throughput:' "$test_path_dir/output/train_verl_qwen2_5_vl_32b.log" | awk -F 'perf/throughput:' '{print$2}' | awk -F ' ' '{print$1}' | head -n 4 | awk '{sum+=$1} END {if (NR > 0) print"",sum/NR}')
+
+# Drop overflow sentinel values produced by functional failures
+if [[ "${FPS}" == "2147483647" || "${FPS}" == "-2147483647" ]]; then
+    FPS=""
+fi
+# Print throughput
+echo "Final Performance images/sec : $FPS"
+
+# Print end-to-end duration
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance summary
+# Test case information
+DeviceType=$(uname -m)
+CaseName="${Network}_32p_perf"
+
+## Performance data
+# Throughput
+ActualFPS=${FPS}
+
+# Write the key information to ${CaseName}.log (first line truncates the file, the rest append)
+echo "Network = ${Network}" > "$test_path_dir/output/${CaseName}.log"
+echo "DeviceType = ${DeviceType}" >> "$test_path_dir/output/${CaseName}.log"
+echo "CaseName = ${CaseName}" >> "$test_path_dir/output/${CaseName}.log"
+echo "ActualFPS = ${ActualFPS}" >> "$test_path_dir/output/${CaseName}.log"
+echo "E2ETrainingTime = ${e2e_time}" >> "$test_path_dir/output/${CaseName}.log"
-- 
Gitee