diff --git a/PyTorch/built-in/others/WDL_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/others/WDL_for_PyTorch/test/train_performance_1p.sh index ab5c7e2f46fe71c0a7bda39dc10b4ff6aeda88b5..0db1d3a3fea2816280fd9b396e3b039cf253cf98 100644 --- a/PyTorch/built-in/others/WDL_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/others/WDL_for_PyTorch/test/train_performance_1p.sh @@ -1,127 +1,127 @@ -#!/bin/bash -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# less required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -# 设置环境变量 -source ./env.sh - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export RANK_SIZE=1 - -# 数据集路径,保持为空,不需要修改 -data_path="" - -#网络名称,同目录名称,需要模型审视修改 -Network="WDL_for_PyTorch" - -#训练batch_size,,需要模型审视修改 -batch_size=4096 - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -device_id=0 -# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 -if [ $ASCEND_DEVICE_ID ];then - echo "device id is ${ASCEND_DEVICE_ID}" -elif [ ${device_id} ];then - export ASCEND_DEVICE_ID=${device_id} - echo "device id is ${ASCEND_DEVICE_ID}" -else - "[Error] device id must be config" - exit 1 -fi - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/.. -#创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt -fi - -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -python3.7 run_classification_criteo_wdl.py \ - --data_path=$data_path \ - --amp \ - --steps 1000 \ - --sparse_embed_dim 4 \ - --batch_size 4096 \ - --lr 0.0009 \ - --device_id=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -FPS=`grep -a 'Epoch' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep avg_sample_per_sec | awk -F "avg_sample_per_sec: " '{print $NF}'| awk -F "," '{print $1}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#打印,不需要修改 -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 -grep -a 'Step' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# less required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# 设置环境变量 +source ./env.sh + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="WDL_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=4096 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +device_id=0 +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/.. +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3.7 run_classification_criteo_wdl.py \ + --data_path=$data_path \ + --amp \ + --steps 1000 \ + --sparse_embed_dim 4 \ + --batch_size 4096 \ + --lr 0.0009 \ + --device_id=$ASCEND_DEVICE_ID > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'Epoch' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep avg_sample_per_sec | awk -F "avg_sample_per_sec: " '{print $NF}'| awk -F "," '{print $1}'|awk 'NR==1{max=$1;next}{max=max>$1?max:$1}END{print max}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -a 'Step' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ResNet18_ID1593_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/ResNet18_ID1593_for_PyTorch/test/train_performance_1p.sh index ce8c3f635302d6ce99f9711b1f4b9330843c98d9..657706a12a95b869f2898dfa77ad85a5207d0aeb 100644 --- a/PyTorch/contrib/cv/classification/ResNet18_ID1593_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/ResNet18_ID1593_for_PyTorch/test/train_performance_1p.sh @@ -114,7 +114,7 @@ python3 ./main.py \ --dist-backend 'hccl' \ --epochs=${train_epochs} \ --batch-size=${batch_size} \ - --stop-step-num=128 \ + --stop-step-num=5003 \ --device_list=${ASCEND_DEVICE_ID} \ --amp > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &