From f076bcf4005c201b39c974f26d8208d5eb205e6e Mon Sep 17 00:00:00 2001
From: wuxingpeng
Date: Mon, 11 Apr 2022 03:36:31 +0000
Subject: [PATCH 1/2] [PyTorch][built-in][MobileNetV2_for_PyTorch] Add graph
 mode branch.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../train/mobilenetv2_8p_main_anycard.py      | 73 +++++++++++++------
 1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
index b38bdd8a85..550e23ab70 100644
--- a/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
+++ b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
@@ -121,6 +121,10 @@ parser.add_argument('--opt-level', default='O2', type=str,
                     help='loss scale using in amp, default -1 means dynamic')
 parser.add_argument('--class-nums', default=1000, type=int, help='class-nums only for pretrain')
+# graph mode
+parser.add_argument('--graph_mode',
+                    action='store_true',
+                    help='whether to enable graph mode.')

 warnings.filterwarnings('ignore')
 best_acc1 = 0
@@ -341,6 +345,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
     steps_per_epoch = train_loader_len
     print('==========step per epoch======================', steps_per_epoch)
     for i, (images, target) in enumerate(train_loader):
+        # graph mode
+        if args.graph_mode:
+            print("graph mode on")
+            torch.npu.enable_graph_mode()
         if i > 200 :
             pass
         # measure data loading time
@@ -348,25 +356,34 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
         global_step = epoch * steps_per_epoch + i
         lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)

+        # graph mode
+        if args.graph_mode:
+            images = images.to(loc, non_blocking = True)
+            target = target.to(loc, non_blocking = True)
+            images = images.to(torch.float).sub(mean).div(std)
+            target = target.to(torch.int32)
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        else:
+            target = target.to(torch.int32)
+            images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+            target = target.to(loc, non_blocking=True)
+            # compute output
+            output = model(images)
+            stream = torch.npu.current_stream()
+            stream.synchronize()
-        target = target.to(torch.int32)
-        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
-        target = target.to(loc, non_blocking=True)
-
-        # compute output
-        output = model(images)
-        stream = torch.npu.current_stream()
-        stream.synchronize()
-
-        loss = criterion(output, target)
-        stream = torch.npu.current_stream()
-        stream.synchronize()
+            loss = criterion(output, target)
+            stream = torch.npu.current_stream()
+            stream.synchronize()

-        # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))

         # compute gradient and do SGD step
         if args.benchmark == 0:
@@ -377,9 +394,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
                 scaled_loss.backward()
         else:
             loss.backward()
-
-        stream = torch.npu.current_stream()
-        stream.synchronize()
+        # graph mode
+        if not args.graph_mode:
+            stream = torch.npu.current_stream()
+            stream.synchronize()

         if args.benchmark == 0:
             optimizer.step()
@@ -392,8 +410,14 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
                     param.grad /= batch_size_multiplier
                 optimizer.step()
                 optimizer.zero_grad()
-        stream = torch.npu.current_stream()
-        stream.synchronize()
+        # graph mode
+        if args.graph_mode:
+            torch.npu.launch_graph()
+            if i == len(train_loader):
+                torch.npu.synchronize()
+        else:
+            stream = torch.npu.current_stream()
+            stream.synchronize()

         # measure elapsed time
         batch_time.update(time.time() - end)
@@ -403,7 +427,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
         if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                     and args.rank % ngpus_per_node == 0):
             progress.display(i)
-
+    # graph mode
+    if args.graph_mode:
+        print("graph mode off")
+        torch.npu.disable_graph_mode()
     if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                 and args.rank % ngpus_per_node == 0):
         print("[npu id:", args.gpu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
--
Gitee

From 82953bd137c1736d4a803d7f9a05746c74cb055a Mon Sep 17 00:00:00 2001
From: wuxingpeng
Date: Mon, 11 Apr 2022 03:38:50 +0000
Subject: [PATCH 2/2] [PyTorch][built-in][MobileNetV2_for_PyTorch] Add
 train_ID3072_MobileNetV2_performance_1p.sh as the graph-mode execution
 script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...train_ID3072_MobileNetV2_performance_1p.sh | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/test/train_ID3072_MobileNetV2_performance_1p.sh

diff --git a/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/test/train_ID3072_MobileNetV2_performance_1p.sh b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/test/train_ID3072_MobileNetV2_performance_1p.sh
new file mode 100644
index 0000000000..86b6ec9d5e
--- /dev/null
+++ b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/test/train_ID3072_MobileNetV2_performance_1p.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+
+# Current path; no need to modify.
+cur_path=`pwd`
+
+
+# Collective communication parameters; no need to modify.
+export HCCL_WHITELIST_DISABLE=1
+export RANK_SIZE=1
+export JOB_ID=10087
+RANK_ID_START=0
+# source env.sh
+#RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify.
+data_path=""
+
+# Default log level; no need to modify.
+# export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+
+# Basic parameters; review and modify for the model.
+# Network name, same as the directory name.
+Network="MobileNetV2_ID3072_for_PyTorch"
+# Training epochs.
+train_epochs=1
+# Training batch size.
+batch_size=512
+# Training steps.
+train_steps=`expr 1281167 / ${batch_size}`
+# Learning rate.
+learning_rate=0.045
+
+# Precision parameters; precision_mode should be reviewed for the model.
+precision_mode="allow_mix_precision"
+# Fixed parameters; the following do not need to be modified.
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+
+
+if [[ $1 == --help || $1 == --h ]];then
+    echo "usage:./train_ID3072_MobileNetV2_performance_1p.sh --data_path=data_dir --batch_size=1024 --learning_rate=0.04"
+    exit 1
+fi
+
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    elif [[ $para == --learning_rate* ]];then
+        learning_rate=`echo ${para#*=}`
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    fi
+done
+
+PREC=""
+if [[ $precision_mode == "amp" ]];then
+    PREC="--amp"
+fi
+
+# Check that data_path was passed in; no need to modify.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+cd $cur_path
+
+# Set environment variables; no need to modify.
+echo "Device ID: $ASCEND_DEVICE_ID"
+export RANK_ID=$RANK_ID
+
+if [ -d $cur_path/output ];then
+    rm -rf $cur_path/output/*
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+# Modify the training script for the performance run (pass -> break, so the loop exits after ~200 steps).
+sed -i "s|pass|break|g" ${cur_path}/../train/mobilenetv2_8p_main_anycard.py
+wait
+
+# Training start time; no need to modify.
+start_time=$(date +%s)
+
+# Core binding; remove for models that do not need it, review and modify per model.
+python3.7 ${cur_path}/../train/mobilenetv2_8p_main_anycard.py \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed 49 \
+    --workers 128 \
+    --lr 0.05 \
+    --print-freq 1 \
+    --eval-freq 1 \
+    --dist-url 'tcp://127.0.0.1:50002' \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size 1 \
+    --class-nums 1000 \
+    --batch-size $batch_size \
+    --epochs $train_epochs \
+    --rank 0 \
+    --device-list $ASCEND_DEVICE_ID \
+    --amp \
+    --benchmark 0 \
+    --graph_mode \
+    --data $data_path > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+wait
+
+# Training end time; no need to modify.
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Revert the modification (break -> pass).
+sed -i "s|break|pass|g" ${cur_path}/../train/mobilenetv2_8p_main_anycard.py
+wait
+
+# Print results; no need to modify.
+echo "------------------ Final result ------------------"
+# Output performance (FPS); review and modify for the model.
+FPS=`grep FPS ${cur_path}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $NF}'|awk '{sum+=$1} END {print sum/NR}'`
+
+# Print; no need to modify.
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and modify for the model.
+#train_accuracy=`grep -a '* Acc@1' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+
+# Print; no need to modify.
+#echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary.
+# Training case information; no need to modify.
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Get performance data; no need to modify.
+# Throughput.
+ActualFPS=${FPS}
+# Training time per iteration (ms).
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model.
+grep Epoch $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F 'Loss' '{print $2}' |awk '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration; no need to modify.
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify.
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
--
Gitee
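
For context, the graph-mode branch that the first patch adds follows the torch.npu graph-mode pattern used on Ascend NPUs: enable graph mode so an iteration's operators are delivered to the device as a captured graph rather than dispatched eagerly one by one, launch the captured graph each step, synchronize once the work is queued, and disable graph mode before evaluation. The sketch below is only an illustration of that pattern under those assumptions, not code from this repository; loader, model, criterion, optimizer and device are placeholder names, and it assumes an Ascend PyTorch build that provides the same torch.npu calls the patch uses.

import torch  # assumes an Ascend (torch_npu) PyTorch build providing torch.npu.*

def train_one_epoch_graph_mode(loader, model, criterion, optimizer, device):
    # Deliver each iteration to the NPU as a captured graph instead of eager ops.
    torch.npu.enable_graph_mode()
    for step, (images, target) in enumerate(loader):
        images = images.to(device, non_blocking=True).to(torch.float)
        target = target.to(device, non_blocking=True).to(torch.int32)

        output = model(images)             # forward pass, recorded into the graph
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Submit the graph captured for this iteration to the device.
        torch.npu.launch_graph()
    # Drain outstanding work, then return to eager execution; the patch likewise
    # calls torch.npu.disable_graph_mode() at the end of the training loop.
    torch.npu.synchronize()
    torch.npu.disable_graph_mode()

With the second patch applied, a typical invocation from the test directory would be along the lines of bash train_ID3072_MobileNetV2_performance_1p.sh --data_path=/path/to/imagenet (data_path is the only mandatory argument); the script forwards --graph_mode to mobilenetv2_8p_main_anycard.py so the run exercises the new branch.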