From d0d13c51e1c74359c3a1cf9679d3dd568f894993 Mon Sep 17 00:00:00 2001 From: bailang Date: Thu, 31 Mar 2022 17:52:41 +0800 Subject: [PATCH] =?UTF-8?q?[=E4=BC=97=E6=99=BA][Pytorch]=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E8=84=9A=E6=9C=AC=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: bailang --- .../contrib/audio/WaveGlow/requirement.txt | 2 +- .../InceptionV3_ID1596_for_PyTorch/README.md | 8 +-- .../InceptionV3_ID1596_for_PyTorch/main.py | 4 +- .../test/env.sh | 66 ------------------- .../test/train_eval_8p.sh | 7 +- .../test/train_full_1p.sh | 9 ++- .../test/train_full_8p.sh | 9 ++- .../test/train_performance_1p.sh | 9 ++- .../test/train_performance_8p.sh | 9 ++- .../cv/detection/SOLOv2/test/train_eval_1p.sh | 2 +- .../SOLOv2/test/train_finetune_1p.sh | 2 +- .../cv/detection/SOLOv2/test/train_full_1p.sh | 4 +- .../cv/detection/SOLOv2/test/train_full_8p.sh | 2 +- .../SOLOv2/test/train_performance_1p.sh | 4 +- .../SOLOv2/test/train_performance_8p.sh | 2 +- .../test/train_full_8p.sh | 14 +++- .../test/train_performance_8p.sh | 14 +++- .../Lightweight_OpenPose/train.py | 5 +- 18 files changed, 74 insertions(+), 98 deletions(-) delete mode 100644 PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/env.sh diff --git a/PyTorch/contrib/audio/WaveGlow/requirement.txt b/PyTorch/contrib/audio/WaveGlow/requirement.txt index e85be42a1f..45e6410de9 100644 --- a/PyTorch/contrib/audio/WaveGlow/requirement.txt +++ b/PyTorch/contrib/audio/WaveGlow/requirement.txt @@ -1,6 +1,6 @@ matplotlib==3.5.1 numpy==1.20.3 -inflect==5.3.1 +inflect==5.3.0 scipy==1.7.3 Unidecode==1.3.2 Pillow==8.4.0 diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/README.md b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/README.md index 3f642b3e1c..2c227916b0 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/README.md +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/README.md @@ -21,16 +21,16 @@ To train a model, run `main.py`or `main-8p.py` with the desired model architectu # 1p prefomance training 1p -bash test/train_performance_1p.sh +bash test/train_performance_1p.sh --data_path=/data/imagenet # 8p prefomance training 8p -bash test/train_performance_8p.sh +bash test/train_performance_8p.sh --data_path=/data/imagenet # 1p full training 1p -bash test/train_performance_1p.sh +bash test/train_full_1p.sh --data_path=/data/imagenet # 8p full training 8p -bash test/train_performance_8p.sh +bash test/train_full_8p.sh --data_path=/data/imagenet # online inference demo python3.7.5 demo.py diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/main.py b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/main.py index fc04e15082..fad9dc92f3 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/main.py +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/main.py @@ -435,8 +435,8 @@ def train(train_loader, model, criterion, optimizer, epoch, args): if i % args.print_freq == 0: progress.display(i) - if i > 100: - pass + if i < 10: + batch_time.reset() print("batch_size:", args.batch_size, 'Time: {:.3f}'.format(batch_time.avg), '* FPS@all {:.3f}'.format( args.batch_size/batch_time.avg)) diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/env.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/env.sh deleted file mode 100644 index 65547960b3..0000000000 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/env.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -export install_path=/usr/local/Ascend - -if [ -d ${install_path}/toolkit ]; then - export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} - export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH - export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH - export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=${install_path}/opp -else - if [ -d ${install_path}/nnae/latest ];then - export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/nnae/latest - else - export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest - fi -fi - - -#将Host日志输出到串口,0-关闭/1-开启 -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -#设置默认日志级别,0-debug/1-info/2-warning/3-error -export ASCEND_GLOBAL_LOG_LEVEL=3 -#设置Event日志开启标志,0-关闭/1-开启 -export ASCEND_GLOBAL_EVENT_ENABLE=0 -#设置是否开启taskque,0-关闭/1-开启 -export TASK_QUEUE_ENABLE=1 -#设置是否开启PTCopy,0-关闭/1-开启 -export PTCOPY_ENABLE=1 -#设置是否开启combined标志,0-关闭/1-开启 -export COMBINED_ENABLE=1 -#设置特殊场景是否需要重新编译,不需要修改 -export DYNAMIC_OP="ADD#MUL" -#HCCL白名单开关,1-关闭/0-开启 -export HCCL_WHITELIST_DISABLE=1 -export HCCL_IF_IP=$(hostname -I |awk '{print $1}') - -path_lib=$(python3.7 -c """ -import sys -import re -result='' -for index in range(len(sys.path)): - match_sit = re.search('-packages', sys.path[index]) - if match_sit is not None: - match_lib = re.search('lib', sys.path[index]) - - if match_lib is not None: - end=match_lib.span()[1] - result += sys.path[index][0:end] + ':' - - result+=sys.path[index] + '/torch/lib:' -print(result)""" -) - -echo ${path_lib} - -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_eval_8p.sh index b4836dc864..ecd22e14dc 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_eval_8p.sh +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_eval_8p.sh @@ -61,7 +61,12 @@ fi #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi python3 ./main-8p.py \ -a inception_v3 \ diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_1p.sh index ec96dd722a..f802accfeb 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_1p.sh @@ -71,7 +71,12 @@ fi #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi python3 ./main.py \ --data ${data_path} \ @@ -96,7 +101,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -FPS=`grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "FPS" '{print $2}'|awk -F " " '{print $1}' | tail -n +2|awk '{sum+=$1} END {print sum/NR}' | sed s/[[]:space:]//g ` +FPS=`grep "FPS@all" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' | tail -1` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_8p.sh index 477837af43..e80116c341 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_full_8p.sh @@ -59,7 +59,12 @@ fi #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi python3 ./main-8p.py \ -a inception_v3 \ @@ -93,7 +98,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -FPS=`grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "FPS" '{print $2}'|awk -F " " '{print $1}' | tail -n +2|awk '{sum+=$1} END {print sum/NR}' | sed s/[[]:space:]//g ` +FPS=`grep "FPS@all" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' | tail -1` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_1p.sh index d77bd8658a..3a9175ab77 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_1p.sh @@ -70,7 +70,12 @@ fi #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi #参数修改 sed -i "s|pass|break|g" main.py wait @@ -103,7 +108,7 @@ wait # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -FPS=`grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "FPS" '{print $2}'|awk -F " " '{print $1}' | tail -n +2|awk '{sum+=$1} END {print sum/NR}' | sed s/[[]:space:]//g ` +FPS=`grep "FPS@all" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' | tail -1` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_8p.sh index 8930aee190..3d1080d971 100644 --- a/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/classification/InceptionV3_ID1596_for_PyTorch/test/train_performance_8p.sh @@ -59,7 +59,12 @@ fi #################启动训练脚本################# # 训练开始时间,不需要修改 start_time=$(date +%s) -# source 环境变量 +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi python3 ./main-8p.py \ -a inception_v3 \ @@ -91,7 +96,7 @@ e2e_time=$(( $end_time - $start_time )) # 结果打印,不需要修改 echo "------------------ Final result ------------------" # 输出性能FPS,需要模型审视修改 -FPS=`grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "FPS" '{print $2}'|awk -F " " '{print $1}' | tail -n +2|awk '{sum+=$1} END {print sum/NR}' | sed s/[[]:space:]//g ` +FPS=`grep "FPS@all" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $7}' | tail -1` # 打印,不需要修改 echo "Final Performance images/sec : $FPS" diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_eval_1p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_eval_1p.sh index 338b8dd79b..8132b286e3 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_eval_1p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_eval_1p.sh @@ -60,7 +60,7 @@ etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then source ${cur_path}/test/env_npu.sh fi -nohup python tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py $MODEL --show --out results_solo.pkl --eval segm \ +nohup python3.7 tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py $MODEL --show --out results_solo.pkl --eval segm \ --data_root=$data_path > ${cur_path}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_finetune_1p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_finetune_1p.sh index dd5f93484f..84385cae6f 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_finetune_1p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_finetune_1p.sh @@ -67,7 +67,7 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 export NPUID=0 export RANK=0 -python tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 1 \ +python3.7 tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 1 \ --data_root=$data_path --gpu-ids 0 --resume_from work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth \ --fine-tune > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_1p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_1p.sh index b44b9a5126..b4be544856 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_1p.sh @@ -67,10 +67,10 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 export NPUID=0 export RANK=0 -python tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 12 \ +python3.7 tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 12 \ --data_root=$data_path --gpu-ids 0 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait -python tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ +python3.7 tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ --out results_solo.pkl --eval segm --data_root=$data_path >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_8p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_8p.sh index 08dbdd7141..00f40316f4 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_full_8p.sh @@ -96,7 +96,7 @@ do fi done wait -python tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ +python3.7 tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ --out results_solo.pkl --eval segm --data_root=$data_path >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_1p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_1p.sh index 04d54a8554..9cb99d7d0b 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_1p.sh @@ -67,10 +67,10 @@ fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 export NPUID=0 export RANK=0 -python tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 1 \ +python3.7 tools/train.py configs/solov2/solov2_r50_fpn_8gpu_1x.py --opt-level $apex --autoscale-lr --seed 0 --total_epochs 1 \ --data_root=$data_path --gpu-ids 0 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait -python tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ +python3.7 tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ --out results_solo.pkl --eval segm --data_root=$data_path >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_8p.sh b/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_8p.sh index 83aa6d37c4..8e00d0e5da 100644 --- a/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/detection/SOLOv2/test/train_performance_8p.sh @@ -96,7 +96,7 @@ do fi done wait -python tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ +python3.7 tools/test_ins.py configs/solov2/solov2_r50_fpn_8gpu_1x.py work_dirs/solov2_release_r50_fpn_8gpu_1x/latest.pth --show \ --out results_solo.pkl --eval segm --data_root=$data_path >> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait #训练结束时间,不需要修改 diff --git a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_full_8p.sh b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_full_8p.sh index bfede3f4fc..e313b9b047 100644 --- a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_full_8p.sh @@ -98,7 +98,16 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7.5 train.py \ +RANK_ID_START=0 +RANK_SIZE=8 + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + +KERNEL_NUM=$(($(nproc)/8)) +PID_START=$((KERNEL_NUM * RANK_ID)) +PID_END=$((PID_START + KERNEL_NUM - 1)) +taskset -c $PID_START-$PID_END python3.7.5 train.py \ --train-images-folder ${data_path}/train2017/ \ --prepared-train-labels ./prepared_train_annotation.pkl \ --val-labels ./val_subset.json \ @@ -118,12 +127,13 @@ python3.7.5 train.py \ --world-size=1 \ --dist-backend 'hccl' \ --amp \ + --gpu=${RANK_ID} \ --loss-scale=16 \ --opt-level O1 \ --device-list '0,1,2,3,4,5,6,7' \ --device="npu" \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - +done wait # save best model per step diff --git a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_performance_8p.sh b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_performance_8p.sh index 09d08597d8..b83d7e5606 100644 --- a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/test/train_performance_8p.sh @@ -68,7 +68,16 @@ if [ x"${etp_flag}" != x"true" ];then source ${test_path_dir}/env_npu.sh fi -python3.7.5 train.py \ +RANK_ID_START=0 +RANK_SIZE=8 + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + +KERNEL_NUM=$(($(nproc)/8)) +PID_START=$((KERNEL_NUM * RANK_ID)) +PID_END=$((PID_START + KERNEL_NUM - 1)) +taskset -c $PID_START-$PID_END python3.7.5 train.py \ --train-images-folder ${data_path}/train2017/ \ --prepared-train-labels ./prepared_train_annotation.pkl \ --val-labels ./val_subset.json \ @@ -89,10 +98,11 @@ python3.7.5 train.py \ --dist-backend 'hccl' \ --loss-scale=16 \ --amp \ + --gpu=${RANK_ID} \ --opt-level O1 \ --device-list '0,1,2,3,4,5,6,7' \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - +done wait #################获取训练数据################ diff --git a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/train.py b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/train.py index ce68710a1c..1340bf3f4a 100644 --- a/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/train.py +++ b/PyTorch/contrib/cv/pose_estimation/Lightweight_OpenPose/train.py @@ -161,10 +161,7 @@ def main(): args.world_size = ngpus_per_node * args.world_size args.distributed = args.world_size > 1 - if args.distributed: - mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - else: - main_worker(args.gpu, ngpus_per_node, args) + main_worker(args.gpu, ngpus_per_node, args) def main_worker(gpu, ngpus_per_node, args): -- Gitee