From 32c84ceb6d8e034372abdd5a26efd7d7154c2b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 02:36:40 +0000 Subject: [PATCH 01/12] update --- .../cv/classification/Vgg16_ID1630_for_PyTorch/vgg.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/vgg.py b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/vgg.py index b28b64969f..2f87dff860 100644 --- a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/vgg.py +++ b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/vgg.py @@ -55,13 +55,11 @@ class VGG(nn.Module): x = self.fc1(x) x = self.relu(x) if self.training: - x = x.cpu() - x = self.drop(x).npu() + x = self.drop(x) x = self.fc2(x) x = self.relu(x) if self.training: - x = x.cpu() - x = self.drop(x).npu() + x = self.drop(x) x = self.fc3(x) return x -- Gitee From 2b4fc6df5168e0473eec3067a0b2613338e89dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 02:41:46 +0000 Subject: [PATCH 02/12] update --- .../cv/classification/Vgg16_ID1630_for_PyTorch/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py index 7b5057c7d3..c85de5b578 100644 --- a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py +++ b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py @@ -77,7 +77,7 @@ import numpy as np parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet', help='path to dataset') -parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', +parser.add_argument('-j', '--workers', default=128, type=int, metavar='N', help='number of data loading workers (default: 4)') parser.add_argument('--epochs', default=90, type=int, metavar='N', help='number of total epochs to run') @@ -236,14 +236,14 @@ def main_worker(gpu, ngpus_per_node, args): model = vgg16() model = model.to(loc) - optimizer = torch.optim.SGD(model.parameters(), args.lr, + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss().to(loc) if args.amp: - model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale_value) + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale_value,combine_grad=True) #model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) # optionally resume from a checkpoint -- Gitee From e26b62f8037a6961b3a0b426db17c2bd7aa87002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 02:45:00 +0000 Subject: [PATCH 03/12] update --- .../contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py index c85de5b578..edb433043c 100644 --- a/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py +++ b/PyTorch/contrib/cv/classification/Vgg16_ID1630_for_PyTorch/main.py @@ -77,7 +77,7 @@ import numpy as np parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet', help='path to dataset') -parser.add_argument('-j', '--workers', default=128, type=int, metavar='N', +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)') parser.add_argument('--epochs', default=90, type=int, metavar='N', help='number of total epochs to run') -- Gitee From a046e31387ea7e5e058719d5ab513ff5f3debcfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 03:34:30 +0000 Subject: [PATCH 04/12] update --- .../image_classification/ResNeXt50_ID0419_for_PyTorch/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py index 738b696b2a..5e2d40f347 100644 --- a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py @@ -197,7 +197,7 @@ def main(args): data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.workers, pin_memory=True) + sampler=train_sampler, num_workers=args.workers, pin_memory=False) data_loader_test = torch.utils.data.DataLoader( dataset_test, batch_size=args.batch_size, -- Gitee From 4910187f9855767055df0fd9cf5ba2912b48ec1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 03:35:56 +0000 Subject: [PATCH 05/12] update --- .../ResNeXt50_ID0419_for_PyTorch/train.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py index 5e2d40f347..51a6d96a42 100644 --- a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py @@ -60,18 +60,19 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri header = 'Epoch: [{}]'.format(epoch) cnt = 0 for image, target in metric_logger.log_every(data_loader, print_freq, header): - start_time = time.time() - image, target = image.to(device), target.to(torch.int).to(device) - output = model(image) - loss = criterion(output, target) - - optimizer.zero_grad() - if apex: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - optimizer.step() + with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): + start_time = time.time() + image, target = image.to(device), target.to(torch.int).to(device) + output = model(image) + loss = criterion(output, target) + + optimizer.zero_grad() + if apex: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) batch_size = image.shape[0] @@ -81,7 +82,8 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri metric_logger.meters['img/s'].update(batch_size / (time.time() - start_time)) cnt = cnt + 1 - if args.max_steps and cnt > args.max_steps: + # if args.max_steps and cnt > args.max_steps: + if cnt > 10: break -- Gitee From a239db0c687c5bcc4e38299592406ce4b231dfc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 03:36:59 +0000 Subject: [PATCH 06/12] update --- .../ResNeXt50_ID0419_for_PyTorch/train.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py index 51a6d96a42..0ed86f1906 100644 --- a/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/ResNeXt50_ID0419_for_PyTorch/train.py @@ -60,19 +60,19 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri header = 'Epoch: [{}]'.format(epoch) cnt = 0 for image, target in metric_logger.log_every(data_loader, print_freq, header): - with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): - start_time = time.time() - image, target = image.to(device), target.to(torch.int).to(device) - output = model(image) - loss = criterion(output, target) - - optimizer.zero_grad() - if apex: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - optimizer.step() + # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): + start_time = time.time() + image, target = image.to(device), target.to(torch.int).to(device) + output = model(image) + loss = criterion(output, target) + + optimizer.zero_grad() + if apex: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) batch_size = image.shape[0] @@ -82,8 +82,8 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri metric_logger.meters['img/s'].update(batch_size / (time.time() - start_time)) cnt = cnt + 1 - # if args.max_steps and cnt > args.max_steps: - if cnt > 10: + if args.max_steps and cnt > args.max_steps: + # if cnt > 10: break -- Gitee From ec065991d6e7739ba630ea60276a727c739ff2eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 06:16:14 +0000 Subject: [PATCH 07/12] update --- .../cv/image_classification/VGG16_ID0467_for_PyTorch/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py index 4e9f258d82..0816884f66 100644 --- a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py @@ -62,7 +62,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri for image, target in metric_logger.log_every(data_loader, print_freq, header): start_time = time.time() #image, target = image.to(device), target.to(device) - image, target = image.to(device), target.to(torch.int).to(device) + image, target = image.to(device,non_blocking=True), target.to(torch.int).to(device,non_blocking=True) output = model(image) loss = criterion(output, target) @@ -199,7 +199,7 @@ def main(args): data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.workers, pin_memory=True) + sampler=train_sampler, num_workers=args.workers, pin_memory=True,drop_last=True) data_loader_test = torch.utils.data.DataLoader( dataset_test, batch_size=args.batch_size, -- Gitee From f1b62538031e2b1bee42ccab2da02475660ae4c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 06:32:26 +0000 Subject: [PATCH 08/12] update --- .../VGG16_ID0467_for_PyTorch/train.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py index 0816884f66..a59c3e6f84 100644 --- a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py @@ -60,19 +60,27 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri header = 'Epoch: [{}]'.format(epoch) cnt = 0 for image, target in metric_logger.log_every(data_loader, print_freq, header): - start_time = time.time() - #image, target = image.to(device), target.to(device) - image, target = image.to(device,non_blocking=True), target.to(torch.int).to(device,non_blocking=True) - output = model(image) - loss = criterion(output, target) - - optimizer.zero_grad() - if apex: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - optimizer.step() + with torch.autograd.profiler.profile(use_npu=True) as prof: + start_time = time.time() + #image, target = image.to(device), target.to(device) + image, target = image.to(device,non_blocking=True), target.to(torch.int).to(device,non_blocking=True) + output = model(image) + loss = criterion(output, target) + + optimizer.zero_grad() + if apex: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() + cnt = str(cnt) + prof.export_chrome_trace("./profiler_"+cnt+"_npu.json") + cnt = int(cnt) + print('successfully_training') + if cnt==10: + sys.exit() + acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) batch_size = image.shape[0] -- Gitee From 8d2363568ea069a5b7453dd388c8a059e320dfb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 06:37:21 +0000 Subject: [PATCH 09/12] update --- .../VGG16_ID0467_for_PyTorch/train.py | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py index a59c3e6f84..95363c0360 100644 --- a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py @@ -60,26 +60,26 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri header = 'Epoch: [{}]'.format(epoch) cnt = 0 for image, target in metric_logger.log_every(data_loader, print_freq, header): - with torch.autograd.profiler.profile(use_npu=True) as prof: - start_time = time.time() - #image, target = image.to(device), target.to(device) - image, target = image.to(device,non_blocking=True), target.to(torch.int).to(device,non_blocking=True) - output = model(image) - loss = criterion(output, target) - - optimizer.zero_grad() - if apex: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - optimizer.step() - cnt = str(cnt) - prof.export_chrome_trace("./profiler_"+cnt+"_npu.json") - cnt = int(cnt) - print('successfully_training') - if cnt==10: - sys.exit() + # with torch.autograd.profiler.profile(use_npu=True) as prof: + start_time = time.time() + #image, target = image.to(device), target.to(device) + image, target = image.to(device,non_blocking=True), target.to(torch.int).to(device,non_blocking=True) + output = model(image) + loss = criterion(output, target) + + optimizer.zero_grad() + if apex: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() + # cnt = str(cnt) + # prof.export_chrome_trace("./profiler_"+cnt+"_npu.json") + # cnt = int(cnt) + # print('successfully_training') + # if cnt==10: + # sys.exit() acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) @@ -228,6 +228,7 @@ def main(args): if args.apex: model, optimizer = amp.initialize(model, optimizer, opt_level=args.apex_opt_level, + loss_scale=128, combine_grad=True ) -- Gitee From 4edda00373dad425fdc7004a2936f49baab7b37e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 07:11:33 +0000 Subject: [PATCH 10/12] update --- .../cv/image_classification/VGG16_ID0467_for_PyTorch/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py index 95363c0360..2776b368b2 100644 --- a/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py +++ b/PyTorch/dev/cv/image_classification/VGG16_ID0467_for_PyTorch/train.py @@ -207,7 +207,7 @@ def main(args): data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.workers, pin_memory=True,drop_last=True) + sampler=train_sampler, num_workers=args.workers, pin_memory=True) data_loader_test = torch.utils.data.DataLoader( dataset_test, batch_size=args.batch_size, -- Gitee From 4562260081d629a2a7dff758c3b7decc3cd08529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 08:19:33 +0000 Subject: [PATCH 11/12] updat --- .../image_classification/training.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py b/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py index 86db6ff76f..c31035a445 100644 --- a/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py +++ b/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py @@ -274,34 +274,35 @@ def get_train_step( model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1 ): def _step(input, target, optimizer_step=True): - input_var = Variable(input) - target_var = Variable(target) - loss, output = model_and_loss(input_var, target_var) - if torch.distributed.is_initialized(): - reduced_loss = utils.reduce_tensor(loss.data) - else: - reduced_loss = loss.data + with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): + input_var = Variable(input) + target_var = Variable(target) + loss, output = model_and_loss(input_var, target_var) + if torch.distributed.is_initialized(): + reduced_loss = utils.reduce_tensor(loss.data) + else: + reduced_loss = loss.data - if fp16: - optimizer.backward(loss) - elif use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() + if fp16: + optimizer.backward(loss) + elif use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() - if optimizer_step: - opt = ( - optimizer.optimizer - if isinstance(optimizer, FP16_Optimizer) - else optimizer - ) - for param_group in opt.param_groups: - for param in param_group["params"]: - param.grad /= batch_size_multiplier + if optimizer_step: + opt = ( + optimizer.optimizer + if isinstance(optimizer, FP16_Optimizer) + else optimizer + ) + for param_group in opt.param_groups: + for param in param_group["params"]: + param.grad /= batch_size_multiplier - optimizer.step() - optimizer.zero_grad() + optimizer.step() + optimizer.zero_grad() #torch.cuda.synchronize() torch.npu.synchronize() @@ -376,7 +377,7 @@ def train( data_iter = utils.first_n(prof, data_iter) for i, (input, target) in data_iter: - if i == 300: + if i == 10: pass bs = input.size(0) lr_scheduler(optimizer, i, epoch) -- Gitee From d11a4fe2598ed48e4ac826b37d3a43c8890d548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com> Date: Fri, 25 Mar 2022 09:03:58 +0000 Subject: [PATCH 12/12] update --- .../image_classification/training.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py b/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py index c31035a445..ab4fe76d66 100644 --- a/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py +++ b/PyTorch/dev/cv/image_classification/SEResNext_ID0415_for_PyTorch/image_classification/training.py @@ -274,35 +274,35 @@ def get_train_step( model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1 ): def _step(input, target, optimizer_step=True): - with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): - input_var = Variable(input) - target_var = Variable(target) - loss, output = model_and_loss(input_var, target_var) - if torch.distributed.is_initialized(): - reduced_loss = utils.reduce_tensor(loss.data) - else: - reduced_loss = loss.data + # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True): + input_var = Variable(input) + target_var = Variable(target) + loss, output = model_and_loss(input_var, target_var) + if torch.distributed.is_initialized(): + reduced_loss = utils.reduce_tensor(loss.data) + else: + reduced_loss = loss.data - if fp16: - optimizer.backward(loss) - elif use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() + if fp16: + optimizer.backward(loss) + elif use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() - if optimizer_step: - opt = ( - optimizer.optimizer - if isinstance(optimizer, FP16_Optimizer) - else optimizer - ) - for param_group in opt.param_groups: - for param in param_group["params"]: - param.grad /= batch_size_multiplier + if optimizer_step: + opt = ( + optimizer.optimizer + if isinstance(optimizer, FP16_Optimizer) + else optimizer + ) + for param_group in opt.param_groups: + for param in param_group["params"]: + param.grad /= batch_size_multiplier - optimizer.step() - optimizer.zero_grad() + optimizer.step() + optimizer.zero_grad() #torch.cuda.synchronize() torch.npu.synchronize() @@ -377,7 +377,7 @@ def train( data_iter = utils.first_n(prof, data_iter) for i, (input, target) in data_iter: - if i == 10: + if i == 300: pass bs = input.size(0) lr_scheduler(optimizer, i, epoch) -- Gitee