From ea68bd0de2fcd35de8e20c94fd8e24acc238f148 Mon Sep 17 00:00:00 2001
From: 周昊 <11116189+Oliver-H-Chow@user.noreply.gitee.com>
Date: Tue, 7 Jun 2022 07:35:10 +0000
Subject: [PATCH] Delete file
 PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_1p.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_8p.py
Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README.md

[Xi'an Jiaotong University] [University Contribution] [PyTorch] WideResNet101_2_for_Pytorch initial commit

# WideResNet101_2_for_Pytorch

- Reference implementation:

```
url=https://github.com/pytorch/examples/tree/master/imagenet
branch=master
commit_id=0487749de2fd36f01f4f7f5877b5c9a28ec1fa7f
```

- Accuracy and performance

| Torch | Acc@1 | FPS | Npu_nums | Epochs |
| :------: | :------: | :------: | :------: | :------: |
| 1.8 | - | 394 | 1 | 1 |
| 1.8 | 78.625 | 2929 | 8 | 90 |
| 1.5 | - | 386 | 1 | 1 |
| 1.5 | 78.627 | 3109 | 8 | 90 |

# Self-verification report

```shell
# 1p train perf
# Verify that the performance log file is produced correctly
bash train_performance_1p.sh
# Acceptance result: OK

# 8p train perf
# Verify that the performance log file is produced correctly
bash train_performance_8p.sh
# Acceptance result: OK

# 1p train full
# Verify that the performance/accuracy log file is produced and the model file is saved correctly
bash train_full_1p.sh
# Acceptance result: OK
# Remarks: record missing output logs, runtime failures, error logs, etc.

# 8p train full
# Verify that the performance/accuracy log file is produced and the model file is saved correctly
bash train_full_8p.sh
# Acceptance result: OK
# Remarks: record missing output logs, runtime failures, error logs, etc.
```

Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_1p.sh
Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_8p.sh
Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_8p.sh
Delete file PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_1p.sh
---
 .../WideResNet101_2_for_Pytorch/README.md     | 11 ++-
 .../WideResNet101_2_for_Pytorch/README_raw.md | 80 +++++++++++++++++++
 .../main_npu_1p.py                            | 44 +++++-----
 .../main_npu_8p.py                            | 48 ++++++-----
 .../test/train_full_1p.sh                     |  4 +-
 .../test/train_full_8p.sh                     |  4 +-
 .../test/train_performance_1p.sh              |  7 +-
 .../test/train_performance_8p.sh              |  4 +-
 8 files changed, 146 insertions(+), 56 deletions(-)
 create mode 100644 PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README_raw.md

diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README.md b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README.md
index 1fad48e94b..479b854df0 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README.md
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README.md
@@ -42,10 +42,12 @@ bash ./test/train_eval_8p.sh --data_path=real_data_path
 
 ## WideResnet101_2 training result
 
-| Acc@1 | FPS | Npu_nums | Epochs | AMP_Type |
-| :------: | :------: | :------: | :------: | :------: |
-| - | 386 | 1 | 1 | O2 |
-| 78.627 | 3109 | 8 | 90 | O2 |
+| Torch | Acc@1 | FPS | Npu_nums | Epochs | AMP_Type |
+| :------: | :------: | :------: | :------: | :------: | :------: |
+| 1.8 | - | 394 | 1 | 1 | O2 |
+| 1.8 | 78.625 | 2929 | 8 | 90 | O2 |
+| 1.5 | - | 386 | 1 | 1 | O2 |
+| 1.5 | 78.627 | 3109 | 8 | 90 | O2 |
 
 ```
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README_raw.md b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README_raw.md
new file mode 100644
index 0000000000..1fad48e94b
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/README_raw.md
@@ -0,0 +1,80 @@
+# WideResnet101_2
+
+This implements training of WideResnet101_2 on the ImageNet dataset, mainly modified from [pytorch/examples](https://github.com/pytorch/examples/tree/master/imagenet).
+
+## WideResnet101_2 Detail
+
+As of the current date, Ascend-Pytorch is still inefficient for contiguous operations. Therefore, WideResnet101_2 is re-implemented using semantics such as custom OPs.
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then move validation images to labeled subfolders, using [the following shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
+
+## Training
+
+To train a model, run `main_npu_1p.py` or `main_npu_8p.py` with the desired model architecture and the path to the ImageNet dataset:
+
+```bash
+# training 1p accuracy
+bash ./test/train_full_1p.sh --data_path=real_data_path
+
+# training 1p performance
+bash ./test/train_performance_1p.sh --data_path=real_data_path
+
+# training 8p accuracy
+bash ./test/train_full_8p.sh --data_path=real_data_path
+
+# training 8p performance
+bash ./test/train_performance_8p.sh --data_path=real_data_path
+```
+
+Log path:
+    test/output/device_id/train_${device_id}.log                  # training detail log
+    test/output/device_id/WideResnet101_2_bs8192_8p_perf.log      # 8p training performance result log
+    test/output/device_id/WideResnet101_2_bs8192_8p_acc.log       # 8p training accuracy result log
+
+```bash
+# eval default 8p, should support 1p
+bash ./test/train_eval_8p.sh --data_path=real_data_path
+```
+
+## WideResnet101_2 training result
+
+| Acc@1 | FPS | Npu_nums | Epochs | AMP_Type |
+| :------: | :------: | :------: | :------: | :------: |
+| - | 386 | 1 | 1 | O2 |
+| 78.627 | 3109 | 8 | 90 | O2 |
+
+## Inference
+
+Download the MindX SDK development kit (https://www.hiascend.com/software/mindx-sdk/sdk-detail), version 2.0.2. Then build the inference image and start the Docker container:
+```
+docker build -t infer_image --build-arg FROM_IMAGE_NAME=base_image:tag --build-arg SDK_PKG=sdk_pkg
+bash docker_start_infer.sh docker_image model_dir
+```
+
+# mxbase
+Configure the environment variables and modify the label_file and offline-inference model paths in opencv.cpp. Then build the program and start inference:
+```
+bash build.sh
+./wideresnet [val_image_path]
+```
+Calculate the inference accuracy:
+```
+python3.7 classification_task_metric.py result/ ../../data/config/val_label.txt . ./result.json
+cat result.json
+```
+
+# sdk
+Run `python main.py --help` to view the parameter details and modify them accordingly.
+Then start inference and calculate the inference accuracy:
+```
+bash run.sh ../../data/input/result
+python3.7 classification_task_metric.py result/ ../../data/config/val_label.txt . ./result.json
+cat result.json
+```
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_1p.py b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_1p.py
index 0857a3b661..f640bd8764 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_1p.py
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_1p.py
@@ -24,6 +24,10 @@ import torch.npu
 from apex import amp
 
 import torch
+
+if torch.__version__ >= "1.8.1":
+    import torch_npu
+
 import torch.nn as nn
 import torch.nn.parallel
 import torch.backends.cudnn as cudnn
@@ -41,18 +45,18 @@ from models import resnet_0_6_0
 CALCULATE_DEVICE = "npu:0"
 
 model_names = sorted(name for name in models.__dict__
-    if name.islower() and not name.startswith("__")
-    and callable(models.__dict__[name]))
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
-parser.add_argument('data', metavar='DIR',
+parser.add_argument('data', metavar='DIR', default="/opt/npu/imagenet",
                     help='path to dataset')
 parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                     choices=model_names,
                     help='model architecture: ' +
-    ' | '.join(model_names) +
-    ' (default: resnet18)')
-parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    ' | '.join(model_names) +
+                    ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=128, type=int, metavar='N',
                     help='number of data loading workers (default: 4)')
 parser.add_argument('--epochs', default=90, type=int, metavar='N',
                     help='number of total epochs to run')
@@ -105,8 +109,8 @@ parser.add_argument('--amp', default=False, action='store_true',
                     help='use amp to train the model')
 parser.add_argument('--warm_up_epochs', default=0, type=int,
                     help='warm up')
-parser.add_argument('--loss-scale', default=1024., type=float,
-                    help='loss scale using in amp, default -1 means dynamic')
+parser.add_argument('--loss-scale', default='dynamic', type=str,
+                    help='loss scale used in amp, default dynamic')
 parser.add_argument('--opt-level', default='O2', type=str,
                     help='loss scale using in amp, default -1 means dynamic')
 parser.add_argument('--prof', default=False, action='store_true',
@@ -130,7 +134,6 @@ parser.add_argument('--model_url',
 parser.add_argument('--onnx', default=True, action='store_true',
                     help="convert pth model to onnx")
 
-
 cur_step = 0
 CACHE_TRAINING_URL = "/cache/training/"
 CACHE_DATA_URL = "/cache/data_url"
@@ -222,7 +225,7 @@ def main_worker(gpu, ngpus_per_node, args):
         pretrained_dict.pop('fc.bias')
         for param in model.parameters():
             param.requires_grad = False
-        model.fc = nn.Linear(2048,1000)
+        model.fc = nn.Linear(2048, 1000)
     else:
         print("=> creating model wide_resnet101_2")
         model = resnet_0_6_0.wide_resnet101_2()
@@ -232,7 +235,7 @@ def main_worker(gpu, ngpus_per_node, args):
     # elif args.distributed:
     ###### modify npu_p1 2######
     if args.distributed:
-    ###### modify npu_p1 2 end ######
+        ###### modify npu_p1 2 end ######
         # For multiprocessing distributed, DistributedDataParallel constructor
         # should always set the single device scope, otherwise,
         # DistributedDataParallel will use all available devices.
@@ -271,9 +274,9 @@ def main_worker(gpu, ngpus_per_node, args):
     criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE)
     ############## npu modify 4 end #############
     optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr,
-        momentum=args.momentum,
-        nesterov=True,
-        weight_decay=args.weight_decay)
+                                            momentum=args.momentum,
+                                            nesterov=True,
+                                            weight_decay=args.weight_decay)
     ###### modify 1 ######
     if args.amp:
         model, optimizer = amp.initialize(
@@ -370,7 +373,7 @@ def main_worker(gpu, ngpus_per_node, args):
         best_acc1 = max(acc1, best_acc1)
 
         if not args.multiprocessing_distributed or (args.multiprocessing_distributed
-            and args.rank % ngpus_per_node == 0):
+                and args.rank % ngpus_per_node == 0):
             save_checkpoint({
                 'epoch': epoch + 1,
                 'arch': args.arch,
@@ -420,6 +423,7 @@ def profiling(data_loader, model, criterion, optimizer, args):
 
     prof.export_chrome_trace("output.prof")
 
+
 def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node):
     batch_time = AverageMeter('Time', ':6.3f')
     data_time = AverageMeter('Data', ':6.3f')
@@ -454,7 +458,7 @@ def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node
         loss = criterion(output, target)
 
         # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))# pylint: disable=unbalanced-tuple-unpacking
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))  # pylint: disable=unbalanced-tuple-unpacking
         losses.update(loss.item(), images.size(0))
         top1.update(acc1[0], images.size(0))
         top5.update(acc5[0], images.size(0))
@@ -475,10 +479,10 @@ def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node
         batch_time.update(time.time() - end)
         end = time.time()
 
-        ###### modify 4 ###### 
+        ###### modify 4 ######
         if i % args.print_freq == 0:
             if not args.multiprocessing_distributed or (args.multiprocessing_distributed
-                and args.rank % ngpus_per_node == 0):
+                    and args.rank % ngpus_per_node == 0):
                 progress.display(i)
 
     if not args.multiprocessing_distributed or (args.multiprocessing_distributed
@@ -495,7 +499,7 @@ def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node
 
 def validate(val_loader, model, criterion, args):
     ###### modify 5 ######
-    batch_time = AverageMeter('Time', ':6.3f', start_count_index= 5)
+    batch_time = AverageMeter('Time', ':6.3f', start_count_index=5)
     ###### modify 5 end ######
     losses = AverageMeter('Loss', ':.4e')
     top1 = AverageMeter('Acc@1', ':6.2f')
@@ -523,7 +527,7 @@ def validate(val_loader, model, criterion, args):
             loss = criterion(output, target)
 
             # measure accuracy and record loss
-            acc1, acc5 = accuracy(output, target, topk=(1, 5))# pylint: disable=unbalanced-tuple-unpacking
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))  # pylint: disable=unbalanced-tuple-unpacking
             losses.update(loss.item(), images.size(0))
             top1.update(acc1[0], images.size(0))
             top5.update(acc5[0], images.size(0))
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_8p.py b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_8p.py
index 0f55c67c7b..1b38bf62a3 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_8p.py
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/main_npu_8p.py
@@ -25,6 +25,9 @@ from apex import amp
 import math
 
 import torch
+
+if torch.__version__ >= "1.8.1":
+    import torch_npu
 import torch.nn as nn
 import torch.nn.parallel
 import torch.backends.cudnn as cudnn
@@ -40,18 +43,18 @@ import models
 from models import resnet_0_6_0
 
 model_names = sorted(name for name in models.__dict__
-    if name.islower() and not name.startswith("__")
-    and callable(models.__dict__[name]))
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))
 
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
-parser.add_argument('data', metavar='DIR',
+parser.add_argument('data', metavar='DIR', default="/opt/npu/imagenet",
                     help='path to dataset')
 parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                     choices=model_names,
                     help='model architecture: ' +
-    ' | '.join(model_names) +
-    ' (default: resnet18)')
-parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    ' | '.join(model_names) +
+                    ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=128, type=int, metavar='N',
                     help='number of data loading workers (default: 4)')
 parser.add_argument('--epochs', default=90, type=int, metavar='N',
                     help='number of total epochs to run')
@@ -103,8 +106,8 @@ parser.add_argument('--amp', default=False, action='store_true',
                     help='use amp to train the model')
 parser.add_argument('--warm_up_epochs', default=0, type=int,
                     help='warm up')
-parser.add_argument('--loss-scale', default=1024., type=float,
-                    help='loss scale using in amp, default -1 means dynamic')
+parser.add_argument('--loss-scale', default='dynamic', type=str,
+                    help='loss scale used in amp, default dynamic')
 parser.add_argument('--opt-level', default='O2', type=str,
                     help='loss scale using in amp, default -1 means dynamic')
 parser.add_argument('--prof', default=False, action='store_true',
@@ -114,6 +117,7 @@ parser.add_argument('--save_path', default='', type=str,
 
 best_acc1 = 0
 
+
 def device_id_to_process_device_map(device_list):
     devices = device_list.split(",")
     devices = [int(x) for x in devices]
@@ -125,6 +129,7 @@ def device_id_to_process_device_map(device_list):
 
     return process_device_map
 
+
 def main():
     args = parser.parse_args()
 
@@ -205,7 +210,7 @@ def main_worker(gpu, ngpus_per_node, args):
         model.load_state_dict(pretrained_dict, strict=False)
         for param in model.parameters():
             param.requires_grad = False
-        model.fc = nn.Linear(2048,1000)
+        model.fc = nn.Linear(2048, 1000)
     else:
         print("=> creating model wide_resnet101_2")
         model = resnet_0_6_0.wide_resnet101_2()
@@ -220,7 +225,6 @@ def main_worker(gpu, ngpus_per_node, args):
             args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
     ############## npu modify end #############
 
-
     # Data loading code
     traindir = os.path.join(args.data, 'train')
     valdir = os.path.join(args.data, 'val')
@@ -259,9 +263,9 @@ def main_worker(gpu, ngpus_per_node, args):
     # define loss function (criterion) and optimizer
     criterion = nn.CrossEntropyLoss().to(loc)
     optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr,
-        momentum=args.momentum,
-        nesterov=True,
-        weight_decay=args.weight_decay)
+                                            momentum=args.momentum,
+                                            nesterov=True,
+                                            weight_decay=args.weight_decay)
 
     if args.amp:
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
@@ -277,7 +281,7 @@ def main_worker(gpu, ngpus_per_node, args):
             model.load_state_dict(checkpoint['state_dict'])
             optimizer.load_state_dict(checkpoint['optimizer'])
             if args.amp:
-              amp.load_state_dict(checkpoint['amp'])
+                amp.load_state_dict(checkpoint['amp'])
             print("=> loaded checkpoint '{}' (epoch {})"
                   .format(args.resume, checkpoint['epoch']))
         else:
@@ -310,14 +314,14 @@ def main_worker(gpu, ngpus_per_node, args):
         best_acc1 = max(acc1, best_acc1)
 
         if not args.multiprocessing_distributed or (args.multiprocessing_distributed
-            and args.rank % ngpus_per_node == 0):
+                and args.rank % ngpus_per_node == 0):
             if args.amp:
                 save_checkpoint({
                     'epoch': epoch + 1,
                     'arch': args.arch,
                     'state_dict': model.state_dict(),
                     'best_acc1': best_acc1,
-                    'optimizer' : optimizer.state_dict(),
+                    'optimizer': optimizer.state_dict(),
                     'amp': amp.state_dict(),
                 }, is_best)
             else:
@@ -326,7 +330,7 @@ def main_worker(gpu, ngpus_per_node, args):
                     'arch': args.arch,
                     'state_dict': model.state_dict(),
                     'best_acc1': best_acc1,
-                    'optimizer' : optimizer.state_dict(),
+                    'optimizer': optimizer.state_dict(),
                 }, is_best)
 
 
@@ -367,6 +371,7 @@ def profiling(data_loader, model, criterion, optimizer, args):
 
     prof.export_chrome_trace("output.prof")
 
+
 def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node):
     batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
@@ -416,7 +421,7 @@ def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node
         batch_time.update(time.time() - end)
         end = time.time()
 
-        ###### modify 4 ###### 
+        ###### modify 4 ######
         if i % args.print_freq == 0:
             if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                     and args.rank % ngpus_per_node == 0):
@@ -433,7 +438,7 @@ def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node
 
 def validate(val_loader, model, criterion, args, ngpus_per_node):
     ###### modify 5 ######
-    batch_time = AverageMeter('Time', ':6.3f', start_count_index= 5)
+    batch_time = AverageMeter('Time', ':6.3f', start_count_index=5)
     ###### modify 5 end ######
     losses = AverageMeter('Loss', ':.4e')
     top1 = AverageMeter('Acc@1', ':6.2f')
@@ -472,11 +477,10 @@ def validate(val_loader, model, criterion, args, ngpus_per_node):
                     and args.rank % ngpus_per_node == 0):
                 progress.display(i)
 
-
     if not args.multiprocessing_distributed or (args.multiprocessing_distributed
             and args.rank % ngpus_per_node == 0):
         print("[npu id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
-            .format(top1=top1, top5=top5))
+              .format(top1=top1, top5=top5))
 
     return top1.avg
 
@@ -572,4 +576,4 @@ def accuracy(output, target, topk=(1,)):
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_1p.sh
index 428550171f..ad1c12715b 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_1p.sh
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_1p.sh
@@ -111,16 +111,16 @@ python3.7 -u ./main_npu_1p.py \
     "${data_path}" \
     --lr=0.2 \
     --print-freq=10 \
+    --workers=128 \
     --epochs=${train_epochs} \
     --amp \
-    --loss-scale=128.0 \
+    --loss-scale='dynamic' \
     --opt-level='O2' \
     --device='npu' \
     --world-size=1 \
     --npu=${ASCEND_DEVICE_ID} \
     --save_path=${test_path_dir}/train_1p_${start_time} \
     --batch-size=${batch_size} > ${test_path_dir}/train_1p_${start_time}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
-
 wait
 
 ################## Get training statistics ##################
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_8p.sh
index 00669260e6..955ac88c38 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_8p.sh
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_full_8p.sh
@@ -100,7 +100,7 @@ else
     mkdir -p ${test_path_dir}/train_8p_${start_time}
 fi
 
-python3.7 -u ./main_npu_8p.py \
+nohup python -u ./main_npu_8p.py \
     "${data_path}" \
     --addr=$(hostname -I |awk '{print $1}') \
     --lr=${learning_rate} \
@@ -112,7 +112,7 @@ python3.7 -u ./main_npu_8p.py \
     --world-size=1 \
     --dist-backend='hccl' \
     --multiprocessing-distributed \
-    --loss-scale=128.0 \
+    --loss-scale='dynamic' \
     --opt-level='O2' \
     --device='npu' \
     --rank=0 \
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_1p.sh
index 0d05811ca3..a874cb4e58 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_1p.sh
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_1p.sh
@@ -29,6 +29,7 @@ over_dump=False
 data_dump_flag=False
 data_dump_step="10"
 profiling=False
+#profiling=True
 
 # Help message; no modification needed
 if [[ $1 == --help || $1 == -h ]];then
@@ -58,7 +59,7 @@ done
 
 # Check that data_path is provided; no modification needed
 if [[ $data_path == "" ]];then
-    echo "[Error] para \"data_path\" must be confing"
+    echo "[Error] para \"data_path\" must be configured"
     exit 1
 fi
 
@@ -112,13 +113,13 @@ python3.7 -u ./main_npu_1p.py \
     --print-freq=10 \
     --epochs=${train_epochs} \
     --amp \
-    --loss-scale=128.0 \
+    --loss-scale='dynamic' \
     --opt-level='O2' \
+    --workers=128 \
     --device='npu' \
     --world-size=1 \
     --npu=${ASCEND_DEVICE_ID} \
     --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
-
 wait
 
 ################## Get training statistics ##################
diff --git a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_8p.sh
index 07b51f99d8..43fe8de45d 100644
--- a/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_8p.sh
+++ b/PyTorch/contrib/cv/classification/WideResNet101_2_for_Pytorch/test/train_performance_8p.sh
@@ -106,14 +106,12 @@ python3.7 -u ./main_npu_8p.py \
     --world-size=1 \
     --dist-backend='hccl' \
     --multiprocessing-distributed \
-    --loss-scale=128.0 \
+    --loss-scale='dynamic' \
     --opt-level='O2' \
     --device='npu' \
     --rank=0 \
     --warm_up_epochs=5 \
     --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
-
-
 wait
 
 ################## Get training statistics ##################
-- 
Gitee