diff --git a/PyTorch/contrib/cv/classification/LVVIT/LICENSE b/PyTorch/contrib/cv/classification/LVVIT/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6c1cbb5efebe718b26faa414d1835a92a47c5f0a --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. 
+ + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Zihang Jiang + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/PyTorch/contrib/cv/classification/LVVIT/README.md b/PyTorch/contrib/cv/classification/LVVIT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bccf23ce933350457a781753ec4254e1592f53f
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/README.md
@@ -0,0 +1,92 @@
+
+
+## LV-ViT
+
+LV-ViT is a Transformer-based model for image classification trained with token labeling, introduced in "All Tokens Matter: Token Labeling for Training Better Vision Transformers" ([arXiv](https://arxiv.org/abs/2104.10858)).
+
+## Requirements
+
+torch>=1.4.0
+torchvision>=0.5.0
+pyyaml
+scipy
+timm==0.4.5
+
+Data preparation: ImageNet with the following folder structure
+
+```
+│imagenet/
+├──train/
+│ ├── n01440764
+│ │ ├── n01440764_10026.JPEG
+│ │ ├── n01440764_10027.JPEG
+│ │ ├── ......
+│ ├── ......
+├──val/
+│ ├── n01440764
+│ │ ├── ILSVRC2012_val_00000293.JPEG
+│ │ ├── ILSVRC2012_val_00002138.JPEG
+│ │ ├── ......
+│ ├── ......
+```
+
+## Label generation
+
+To generate token label data for training:
+
+```bash
+python3 generate_label.py /path/to/imagenet/train /path/to/save/label_top5_train_nfnet --model dm_nfnet_f6 --pretrained --img-size 576 -b 32 --crop-pct 1.0
+```
+
+The generated label data is also provided on [Baidu Yun](https://pan.baidu.com/s/1YBqiNN9dAzhEXtPl61bZJw) (password: y6j2).
+
+## Model Train
+
+Train LV-ViT-S:
+
+```bash
+# 1: train on 1 NPU
+bash test/train_full_1p.sh '/Path_to_Imagenet' '/Path_to_Token-label-data'
+# Example: bash test/train_full_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+
+# 2: train on 8 NPUs
+bash test/train_full_8p.sh '/Path_to_Imagenet' '/Path_to_Token-label-data'
+# Example: bash test/train_full_8p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+```
+
+Get model performance:
+
+```bash
+# 1: test 1p performance
+bash test/train_performance_1p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/'
+# Example: bash test/train_performance_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+
+# 2: test 8p performance
+bash test/train_performance_8p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/'
+# Example: bash test/train_performance_8p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+```
+
+## Validation
+
+Replace the first argument with your ImageNet validation set path and the second with the checkpoint path:
+
+```bash
+bash test/train_eval_8p.sh '/PATHTO/imagenet/val' '/PATHTO/LVVIT/eval_pth'
+# Example: bash test/train_eval_8p.sh '/opt/npu/imagenet/val' '/trained/model.pth.tar'
+```
+
+## Fine-tuning
+
+To fine-tune the pre-trained LV-ViT-S:
+
+```bash
+bash test/train_finetune_1p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/' '/Path_to_Trained_pth/'
+# Example: bash test/train_finetune_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet' './finetune/lvvit_s-26m-224-83.3.pth.tar'
+```
+
+## About Train FPS
+
+Example log:
+
+```
+Train: 257 [ 150/625 ( 24%)] Loss: 9.841134 (10.1421) Time: 1.941s, 1054.88/s (2.048s, 1000.09/s) LR: 4.609e-04 Data: 0.029 (0.062)
+```
+
+The FPS is the per-batch rate reported after the batch time; in the log above it is 1054.88.
+
diff --git a/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh b/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh
new file mode 100644
index
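For reference, a minimal Python sketch of inspecting one generated token-label file. The file path below is hypothetical; the [2, 5, H, W] top-5 score/index layout is the one written by generate_label.py later in this diff:

```python
import torch

# Load one token-label file written by generate_label.py (hypothetical path).
# Layout is [2, 5, H, W]: label[0] holds the top-5 scores, label[1] the top-5
# class indices, both stored as float16, one spatial position per image patch.
label = torch.load('./label_top5_train_nfnet/n01440764/n01440764_10026.pt')
scores, indices = label[0].float(), label[1].long()  # each of shape [5, H, W]
print(scores.shape, indices.shape)
print(indices[0])  # most-probable class id at every patch location
```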
0000000000000000000000000000000000000000..36196ea622f7bd832ded0b7ff348b71d1127ece3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh @@ -0,0 +1,5 @@ +#!/bin/bash +NUM_PROC=$1 +shift +python3 -m torch.distributed.launch --nproc_per_node=$NUM_PROC main.py "$@" + diff --git a/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py b/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py new file mode 100644 index 0000000000000000000000000000000000000000..76b71f8cd79c8ccfbe95fd82e38161fb7796fc9e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py @@ -0,0 +1,35 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import tlt.models +# summary of model flops and parameters + +model_list = [tlt.models.lvvit_s, + tlt.models.lvvit_m, + tlt.models.lvvit_l] + +img_size_list=[224,288,384,448] + +for img_size in img_size_list: + for model_name in model_list: + model = model_name(img_size=img_size) + params = sum([m.numel() for m in model.parameters()]) + flops = model.patch_embed.flops() + for blk in model.blocks: + flops = flops + blk.flops(model.patch_embed.num_patches+1) + print("model: {}, img_size:{},\nparams:{:.2f} M, flops: {:.2f} G \n".format(model_name.__name__, img_size, params/1e6, flops/1e9)) + + print('-----------------------') \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/generate_label.py b/PyTorch/contrib/cv/classification/LVVIT/generate_label.py new file mode 100644 index 0000000000000000000000000000000000000000..ad9131147dda5049271621f708157d5c087f4859 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/generate_label.py @@ -0,0 +1,342 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +#!/usr/bin/env python3 +""" +Script to generate token label. 
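+For each training image the script saves one .pt tensor of shape [2, 5, H, W]: the top-5 scores and the top-5 class indices predicted at every patch location, stored as float16.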
+Adapted from https://github.com/rwightman/pytorch-image-models +""" +import argparse +import os +import csv +import glob +import time +import logging +import torch +import torch.nn as nn +import torch.nn.parallel +from collections import OrderedDict +from contextlib import suppress +import numpy as np + +from timm.models import create_model, load_checkpoint, is_model, list_models +from timm.data import create_dataset, create_loader, resolve_data_config +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy +from PIL import Image + +from timm.data import ImageDataset +import logging + +_logger = logging.getLogger(__name__) + + +_ERROR_RETRY = 50 + +has_apex = False +try: + from apex import amp + has_apex = True +except ImportError: + pass + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('labeling') + + +parser = argparse.ArgumentParser(description='Generate token label') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('token_label_root', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--split', metavar='NAME', default='validation', + help='dataset split (default: validation)') +parser.add_argument('--model', '-m', metavar='NAME', default='dpn92', + help='model architecture (default: dpn92)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 2)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--img-size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop pct') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('--num-classes', type=int, default=None, + help='Number classes in dataset') +parser.add_argument('--class-map', default='', type=str, metavar='FILENAME', + help='path to class to idx mapping file (default: "")') +parser.add_argument('--log-freq', default=10, type=int, + metavar='N', help='batch logging frequency (default: 10)') +parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--num-gpu', type=int, default=1, + help='Number of GPUS to use') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--amp', action='store_true', default=False, + help='Use AMP mixed precision. Defaults to Apex, fallback to native Torch AMP.') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--tf-preprocessing', action='store_true', default=False, + help='Use Tensorflow preprocessing pipeline (require CPU TF installed') +parser.add_argument('--use-ema', dest='use_ema', action='store_true', + help='use ema version of weights if present') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--legacy-jit', dest='legacy_jit', action='store_true', + help='use legacy jit mode for pytorch 1.5/1.5.1/1.6 to get back fusion performance') +parser.add_argument('--transfer', action='store_true', default=False, + help='disable evaluation due to dataset mismatch. Can be used to generate label for other dataset using imagenet pre-trained model') + +class ImageDatasetWithIndex(ImageDataset): + + def __getitem__(self, index): + img, target = self.parser[index] + try: + img = img.read() if self.load_bytes else Image.open(img).convert('RGB') + except Exception as e: + _logger.warning(f'Skipped sample (index {index}, file {self.parser.filename(index)}). 
{str(e)}') + self._consecutive_errors += 1 + if self._consecutive_errors < _ERROR_RETRY: + return self.__getitem__((index + 1) % len(self.parser)) + else: + raise e + self._consecutive_errors = 0 + if self.transform is not None: + img = self.transform(img) + if target is None: + target = torch.tensor(-1, dtype=torch.long) + return img, target, index + +class TokenLabelHead(nn.Module): + def __init__(self, base): + super(TokenLabelHead, self).__init__() + self.base = base + base_fc = self.base.get_classifier() + if hasattr(self.base, 'aux_head'): + base_fc = self.base.aux_head + if isinstance(base_fc, nn.Conv2d): + self.fc = base_fc + else: + self.fc = nn.Conv2d( + self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) + self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) + self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) + self.base.reset_classifier(0) # delete original fc layer + + def forward(self, x): + x = self.base.forward_features(x) + + if len(x.shape)==3: + # reshape for ViT like token based models + B,N,C = x.shape + H = int(N**0.5) + if N==H*H+1: + # remove cls token + x = x[:,1:] + x = x.transpose(1,2).reshape(B,C,H,H) + else: + assert x.shape[2]==x.shape[3], 'shape should be B,C,H,H' + + x = self.fc(x) + x = x.permute(0,2,3,1) + return x + + + +def validate(args): + # might as well try to validate something + args.pretrained = args.pretrained or not args.checkpoint + args.prefetcher = False + amp_autocast = suppress # do nothing + if args.amp: + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + else: + _logger.warning("Neither APEX or Native Torch AMP is available.") + assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set." + if args.native_amp: + amp_autocast = torch.cuda.amp.autocast + _logger.info('Generating label in mixed precision with native PyTorch AMP.') + elif args.apex_amp: + _logger.info('Generating label in mixed precision with NVIDIA APEX AMP.') + else: + _logger.info('Generating label in float32. AMP not enabled.') + + if args.legacy_jit: + set_jit_legacy() + + # create model + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + in_chans=3, + scriptable=args.torchscript) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
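+    # NOTE: the backbone is wrapped in TokenLabelHead below. For ViT-style outputs
+    # of shape [B, N, C] it drops the cls token, reshapes the remaining N = H*H
+    # patch tokens to [B, C, H, H] and applies the (1x1 conv) classifier at every
+    # location, so each image yields an [H, W, num_classes] score map whose
+    # per-patch top-5 entries are saved as the token label.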
+ args.num_classes = model.num_classes + + if args.checkpoint: + load_checkpoint(model, args.checkpoint, args.use_ema) + + param_count = sum([m.numel() for m in model.parameters()]) + _logger.info('Model %s created, param count: %d' % (args.model, param_count)) + + data_config = resolve_data_config(vars(args), model=model, use_test_size=True) + model = TokenLabelHead(model) + if args.torchscript: + torch.jit.optimized_execution(True) + model = torch.jit.script(model) + + model = model.cuda() + if args.apex_amp: + model = amp.initialize(model, opt_level='O1') + + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + if args.num_gpu > 1: + model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))) + + criterion = nn.CrossEntropyLoss().cuda() + + dataset = ImageDatasetWithIndex(args.data, parser=args.dataset, + load_bytes=args.tf_preprocessing, class_map=args.class_map) + + + + crop_pct = data_config['crop_pct'] + loader = create_loader( + dataset, + input_size=data_config['input_size'], + batch_size=args.batch_size, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + crop_pct=crop_pct, + pin_memory=args.pin_mem, + tf_preprocessing=args.tf_preprocessing,) + + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + os.makedirs(args.token_label_root, exist_ok=True) + model.eval() + with torch.no_grad(): + # warmup, reduce variability of first batch time, especially for comparing torchscript vs non + input = torch.randn((args.batch_size,) + data_config['input_size']).cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + model(input) + end = time.time() + for batch_idx, (input, target,idxs) in enumerate(loader): + target = target.cuda() + input = input.cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + # compute output + output = model(input) + value, indices = output.topk(5) + for i in range(input.shape[0]): + path = dataset.parser[idxs[i]][0].name + score_path = os.path.join(args.token_label_root, + '/'.join(path.split('/')[-2:]).split('.')[0] + '.pt') + score_dict = os.path.join(args.token_label_root,path.split('/')[-2]) + os.makedirs(score_dict,exist_ok=True) + # save top 5 value and index with shape [2, 5, H, W] + torch.save(torch.stack([value[i].cpu().half(),indices[i].cpu().half()]).permute(0,3,1,2),score_path) + + output = output.mean((1,2)) + if args.transfer: + # do not record loss and acc + loss = torch.sum(output-output) + acc1, acc5 = loss, loss + + else: + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1.item(), input.size(0)) + top5.update(acc5.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if batch_idx % args.log_freq == 0: + _logger.info( + 'Test: [{0:>4d}/{1}] ' + 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) ' + 'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format( + batch_idx, len(loader), batch_time=batch_time, + rate_avg=input.size(0) / batch_time.avg, + loss=losses, top1=top1, top5=top5)) + + top1a, top5a = top1.avg, top5.avg + results = OrderedDict( + 
top1=round(top1a, 4), top1_err=round(100 - top1a, 4), + top5=round(top5a, 4), top5_err=round(100 - top5a, 4), + param_count=round(param_count / 1e6, 2), + img_size=data_config['input_size'][-1], + cropt_pct=crop_pct, + interpolation=data_config['interpolation']) + + _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format( + results['top1'], results['top1_err'], results['top5'], results['top5_err'])) + + return results + + +def main(): + setup_default_logging() + args = parser.parse_args() + validate(args) + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/main.py b/PyTorch/contrib/cv/classification/LVVIT/main.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5200bf5869a29c6b9d7893de2a5d1452a9ae38 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/main.py @@ -0,0 +1,883 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +#!/usr/bin/env python3 +""" ImageNet Training Script +""" +import argparse +from ast import arg +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime +import numpy as np +import random +import torch +import torch.nn as nn +import torchvision.utils +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from tlt.data import create_loader +from timm.data import create_dataset, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import create_model, resume_checkpoint, load_checkpoint, convert_splitbn_model, model_parameters +from timm.utils import * +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from timm.utils import ApexScaler, NativeScaler + +import tlt.models +from tlt.data import create_token_label_target, TokenLabelMixup, FastCollateTokenLabelMixup, create_token_label_loader, create_token_label_dataset +from tlt.loss import TokenLabelCrossEntropy, TokenLabelSoftTargetCrossEntropy +from tlt.utils import load_pretrained_weights + +import time +try: + from apex import amp + from apex.parallel import DistributedDataParallel as ApexDDP + from apex.parallel import convert_syncbn_model + has_apex = True +except ImportError: + has_apex = False + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + + +parser = 
argparse.ArgumentParser(description='PyTorch ImageNet Training') + +# Dataset / Model parameters +parser.add_argument('data_dir', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--train-split', metavar='NAME', default='train', + help='dataset train split (default: train)') +parser.add_argument('--val-split', metavar='NAME', default='validation', + help='dataset validation split (default: validation)') +parser.add_argument('--model', default='lvvit', type=str, metavar='MODEL', + help='Name of model to train (default: "lvvit"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=None, metavar='N', + help='number of label classes (Model default if None)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') +parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') +parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + +# Optimizer parameters +parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.05)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') +parser.add_argument('--clip-mode', type=str, default='norm', + help='Gradient clipping mode. 
One of ("norm", "value", "agc")') + + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') +parser.add_argument('--lr', type=float, default=1.6e-3, metavar='LR', + help='learning rate (default: 1.6e-3)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') +parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 300)') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: rand-m9-mstd0.5-inc1)'), +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. 
Use with `--aug-splits`.') +parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +# Batch norm parameters (only works with gen_efficientnet based models currently) +parser.add_argument('--bn-tf', action='store_true', default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.99992, + help='decay factor for model weights moving average (default: 0.99992)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('--checkpoint-hist', type=int, default=10, metavar='N', + help='number of checkpoints to keep (default: 10)') +parser.add_argument('-j', '--workers', type=int, default=8, metavar='N', + help='how many training processes to use (default: 1)') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 
0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') + +# Token labeling + +parser.add_argument('--token-label', action='store_true', default=False, + help='Use dense token-level label map for training') +parser.add_argument('--token-label-data', type=str, default='', metavar='DIR', + help='path to token_label data') +parser.add_argument('--token-label-size', type=int, default=1, metavar='N', + help='size of result token label map') +parser.add_argument('--dense-weight', type=float, default=1.0, + help='Token labeling loss multiplier (default: 1.0)') +parser.add_argument('--cls-weight', type=float, default=1.0, + help='Cls token prediction loss multiplier (default: 1.0)') +parser.add_argument('--ground-truth', action='store_true', default=False, + help='Use ground truth label to help refine generated target label') + + +# Finetune +parser.add_argument('--finetune', default='', type=str, metavar='PATH', + help='path to checkpoint file (default: none)') +parser.add_argument("--device_id", help="device_id", default=2, type=int) +parser.add_argument("--opt_level", default='O1', type=str, + help='Choose an optimize level, default O2') +parser.add_argument("--loss_scale", default=None, type=int, + help='set loss scale') +parser.add_argument("--distributed", action='store_true', default=False, + help='if distributed') + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. + args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + import random + setup_default_logging() + args, args_text = _parse_args() + + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29688' + + # 固定seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + os.environ['PYTHONHASHSEED'] = str(args.seed) + device = torch.device(f"npu:{args.device_id}") + args.device = device + torch.npu.set_device(device) + + args.prefetcher = not args.no_prefetcher + args.is_master_node = not args.distributed or args.device_id == 0 + + args.world_size = 1 + args.rank = 0 + if args.distributed: + args.world_size = int(os.environ['RANK_SIZE']) + args.rank = int(os.environ['RANK_ID']) + torch.distributed.init_process_group(backend='hccl', init_method='env://', world_size=args.world_size, rank=args.rank) + _logger.info('Training in distributed mode with multiple processes, 1 NPU per process. Process %d, total %d.' 
+ % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on 1 GPUs.') + + # resolve AMP arguments based on PyTorch / Apex availability + use_amp = None + if args.amp: + # `--amp` chooses native amp before apex (APEX ver not actively maintained) + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + if args.apex_amp and has_apex: + use_amp = 'apex' + elif args.native_amp and has_native_amp: + use_amp = 'native' + elif args.apex_amp or args.native_amp: + _logger.warning("Neither APEX or native Torch AMP is available, using float32. " + "Install NVIDA apex or upgrade to PyTorch 1.6") + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + scriptable=args.torchscript, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' + args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly + + if args.finetune: + load_pretrained_weights(model=model,checkpoint_path=args.finetune,use_ema=args.model_ema, strict=False, num_classes=args.num_classes) + + if args.is_master_node: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=args.is_master_node) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # enable split bn (separate bn stats per batch-portion) + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + # move model to GPU, enable channels last layout if set + model.to(device) + + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + # setup synchronized BatchNorm for distributed training + if args.distributed and args.sync_bn: + assert not args.split_bn + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.is_master_node: + _logger.info( + 'Converted model to use Synchronized BatchNorm. 
WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + + if args.torchscript: + assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' + assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' + model = torch.jit.script(model) + + from apex.optimizers import NpuFusedAdam + optimizer = NpuFusedAdam(model.parameters(), lr=args.lr) + + # setup automatic mixed-precision (AMP) loss scaling and op casting + amp_autocast = suppress # do nothing + loss_scaler = None + optimizers=None + if use_amp == 'apex': + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) + loss_scaler = ApexScaler() + if args.is_master_node: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + elif use_amp == 'native': + amp_autocast = torch.cuda.amp.autocast + loss_scaler = NativeScaler() + if args.is_master_node: + _logger.info('Using native Torch AMP. Training in mixed precision.') + else: + if args.is_master_node: + _logger.info('AMP not enabled. Training in float32.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info="is_master_node") + + # setup exponential moving average of model weights, SWA could be used here too + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEmaV2( + model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) + if args.resume: + load_checkpoint(model_ema.module, args.resume, use_ema=True) + + # setup distributed training + if args.distributed: + if args.is_master_node: + _logger.info("Using native Torch DistributedDataParallel.") + model = NativeDDP(model, device_ids=[args.device_id]) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + # setup learning rate schedule and starting epoch + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.epochs <= 5: + num_epochs = args.epochs + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.is_master_node: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + # create the train and eval datasets + + # create token_label dataset + if args.token_label_data: + dataset_train = create_token_label_dataset(args.dataset, root=args.data_dir, label_root=args.token_label_data) + else: + dataset_train = create_dataset( + args.dataset, root=args.data_dir, split=args.train_split, is_training=True, batch_size=args.batch_size) + dataset_eval = create_dataset( + args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) + + # setup mixup / cutmix + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + # create token_label mixup + if args.token_label_data: + mixup_args['label_size']=args.token_label_size + if args.prefetcher: + assert not num_aug_splits + collate_fn = FastCollateTokenLabelMixup(**mixup_args) + else: + mixup_fn = TokenLabelMixup(**mixup_args) + else: + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + assert not args.token_label + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + if args.token_label and args.token_label_data: + use_token_label = True + else: + use_token_label = False + loader_train = create_token_label_loader( + device, + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader, + use_token_label=use_token_label + ) + + loader_eval = create_loader( + device, + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + # setup loss function + + # use token_label loss + if args.token_label: + if args.token_label_size == 1: + # back to relabel/original ImageNet label + train_loss_fn = TokenLabelSoftTargetCrossEntropy().to(device) + else: + train_loss_fn = TokenLabelCrossEntropy(dense_weight=args.dense_weight, \ + cls_weight=args.cls_weight, mixup_active=mixup_active, ground_truth=args.ground_truth).to(device) + + else: + # smoothing is handled with mixup target transform or create_token_label_target function + train_loss_fn = SoftTargetCrossEntropy().to(device) + + validate_loss_fn = nn.CrossEntropyLoss().to(device) + + # setup checkpoint saver and eval metric tracking + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + saver = None + output_dir = '' + if args.is_master_node: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if 
eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: + if args.finetune: + validate(device, model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + for epoch in range(start_epoch, num_epochs): + if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): + loader_train.sampler.set_epoch(epoch) + + train_metrics = train_one_epoch(device, + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn, optimizers=optimizers) + + #break + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.is_master_node: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(device, model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate(device, + model_ema.module, loader_eval, validate_loss_fn, args, + amp_autocast=amp_autocast, + log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_one_epoch( + device, + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None, optimizers = None): + + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * len(loader) + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.to(device), target.to(device) + if mixup_fn is not None: + input, target = mixup_fn(input, target) + else: + # handle token_label without mixup + if args.token_label and args.token_label_data: + target=create_token_label_target(target,num_classes=args.num_classes, + smoothing=args.smoothing, label_size=args.token_label_size, device=device) + if len(target.shape)==1: + 
target=create_token_label_target(target,num_classes=args.num_classes,
+                        smoothing=args.smoothing, device=device)
+        else:
+            if args.token_label and args.token_label_data and not loader.mixup_enabled:
+                target=create_token_label_target(target,num_classes=args.num_classes,
+                    smoothing=args.smoothing, label_size=args.token_label_size, device=device)
+            if len(target.shape)==1:
+                target=create_token_label_target(target,num_classes=args.num_classes,
+                    smoothing=args.smoothing, device=device)
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
+        # if batch_idx in [60, 61]:
+        #     import pdb
+        #     pdb.set_trace()
+        if batch_idx == 60:
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                with amp_autocast():
+                    output = model(input)
+                    loss = loss_fn(output, target)
+
+                if not args.distributed:
+                    losses_m.update(loss.item(), input.size(0))
+
+                optimizer.zero_grad()
+
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward(create_graph=second_order)
+
+                optimizer.step()
+
+            prof.export_chrome_trace("output.prof") # "output.prof" is the path of the exported profiling trace file
+        else:
+            with amp_autocast():
+                output = model(input)
+                loss = loss_fn(output, target)
+
+            if not args.distributed:
+                losses_m.update(loss.item(), input.size(0))
+
+            optimizer.zero_grad()
+
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward(create_graph=second_order)
+
+            optimizer.step()
+
+        if model_ema is not None:
+            model_ema.update(model)
+
+        torch.npu.synchronize()
+        num_updates += 1
+        batch_time_m.update(time.time() - end)
+        # if last_batch or batch_idx % args.log_interval == 0:
+        if last_batch or batch_idx % 50 == 0:
+            lrl = [param_group['lr'] for param_group in optimizer.param_groups]
+            lr = sum(lrl) / len(lrl)
+
+            if args.distributed:
+                reduced_loss = reduce_tensor(loss.data, args.world_size)
+                losses_m.update(reduced_loss.item(), input.size(0))
+
+            if args.is_master_node:
+                _logger.info(
+                    'Train: {} [{:>4d}/{} ({:>3.0f}%)] '
+                    'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
+                    'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
+                    '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                    'LR: {lr:.3e} '
+                    'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
+                        epoch,
+                        batch_idx, len(loader),
+                        100.
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(device, model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.to(device) + target = target.to(device) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + if args.cls_weight==0: + output=output[1].mean(1) + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if args.is_master_node and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt b/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..31529da2e68f25b61e2a3e698a07537281443c03 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:OK +PrecisionStatus:OK \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/requirements.txt b/PyTorch/contrib/cv/classification/LVVIT/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..5752b6818787e99f5243d5e9409ac34f25f7f9da --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/requirements.txt @@ -0,0 +1,20 @@ +apex==0.1+ascend +attr==0.3.1 +attrs==21.2.0 +h5py==2.8.0 +matplotlib==3.5.1 +numpy==1.21.4 +packaging==21.3 +Pillow==8.4.0 +PyYAML==6.0 +scikit-learn==0.24.2 +scipy==1.7.2 +setuptools==40.4.3 +six==1.16.0 +tabulate==0.8.9 +te==0.4.0 +timm==0.4.5 +torch==1.5.0+ascend +torchvision==0.6.0a0 +tqdm==4.19.9 +wheel== 0.32.1 \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e28c978bc253d65d022982913c3c3d56752b0d --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py @@ -0,0 +1,78 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py','../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(448,448), + depth=24, + out_channels=2048, + out_indices=(14, 17, 20, 23), + patch_size=16, + drop_path_rate=0.5, + embed_dim=768, + num_heads=12, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=128, + use_side_layer=True), + decode_head=dict( + type='UPerHead', + in_channels=[768, 768, 768, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=768, + in_index=2, + channels=512, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + # test_cfg=dict(mode='whole')) + + +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +data=dict(samples_per_gpu=2) +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py 
b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..b4ea6ec82d6d937017581aec672a97274b7253c6 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py @@ -0,0 +1,76 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py','../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(512,512), + depth=20, + out_channels=2048, + out_indices=(10, 13, 16, 19), + patch_size=16, + drop_path_rate=0.2, + embed_dim=512, + num_heads=8, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=64, + use_side_layer=True, + skip_lam=2.0), + decode_head=dict( + type='UPerHead', + in_channels=[512, 512, 512, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=256, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=512, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..5140dd5bf4793060c4b405b8b8b930302ac57cad --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py @@ -0,0 +1,76 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py', '../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(512,512), + depth=16, + out_channels=2048, + out_indices=(6, 9, 12, 15), + patch_size=16, + drop_path_rate=0.1, + embed_dim=384, + num_heads=6, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=64, + use_side_layer=True, + skip_lam=2.0), + decode_head=dict( + type='UPerHead', + in_channels=[384, 384, 384, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=256, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=384, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + # test_cfg=dict(mode='whole')) +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py b/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..9939c2353610a3fb84b294b5ccef8a41a0d7131a --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py @@ -0,0 +1,605 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#coding=utf-8 + +""" +Modified from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/backbones/vit.py +""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, + constant_init, kaiming_init, normal_init) +from mmcv.runner import _load_checkpoint +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import DropPath, trunc_normal_ +from functools import partial +from itertools import repeat +import collections.abc + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + return parse + +to_2tuple = _ntuple(2) +class Mlp(nn.Module): + """MLP layer for Encoder block. + + Args: + in_features(int): Input dimension for the first fully + connected layer. + hidden_features(int): Output dimension for the first fully + connected layer. + out_features(int): Output dementsion for the second fully + connected layer. + act_cfg(dict): Config dict for activation layer. + Default: dict(type='GELU'). + drop(float): Drop rate for the dropout layer. Dropout rate has + to be between 0 and 1. Default: 0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + """Attention layer for Encoder block. + + Args: + dim (int): Dimension for the input vector. + num_heads (int): Number of parallel attention heads. + qkv_bias (bool): Enable bias for qkv if True. Default: False. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for output weights. Default: 0. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super(Attention, self).__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + b, n, c = x.shape + qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, + c // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1).float().cpu() + attn = self.attn_drop(attn).npu().half() + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + """Implements encoder block with residual connection. + + Args: + dim (int): The feature dimension. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. 
+ drop (float): Drop rate for mlp output weights. Default: 0. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for attn layer output weights. + Default: 0. + drop_path (float): Drop rate for paths of model. + Default: 0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', requires_grad=True). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + skip_lam (float): residual connection factor. Default: 1.0 + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + proj_drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False, + skip_lam=1.0): + super(Block, self).__init__() + self.with_cp = with_cp + _, self.norm1 = build_norm_layer(norm_cfg, dim) + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, + proj_drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + _, self.norm2 = build_norm_layer(norm_cfg, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop) + self.skip_lam=skip_lam + + def forward(self, x): + + def _inner_forward(x): + out = x + self.drop_path(self.attn(self.norm1(x)))/self.skip_lam + out = out + self.drop_path(self.mlp(self.norm2(out)))/self.skip_lam + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Input image size. + default: 224. + patch_size (int): Width and height for a patch. + default: 16. + in_channels (int): Input channels for images. Default: 3. + embed_dim (int): The embedding dimension. Default: 768. 
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super(PatchEmbed, self).__init__() + if isinstance(img_size, int): + self.img_size = (img_size, img_size) + elif isinstance(img_size, tuple): + self.img_size = img_size + else: + raise TypeError('img_size must be type of int or tuple') + h, w = self.img_size + self.patch_size = (patch_size, patch_size) + self.num_patches = (h // patch_size) * (w // patch_size) + self.proj = Conv2d( + in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + return self.proj(x) + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768, stem_dim=64): + super().__init__() + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_channels, stem_dim, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(stem_dim) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(stem_dim, stem_dim, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(stem_dim) + self.conv3 = nn.Conv2d(stem_dim, stem_dim, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(stem_dim) + + self.proj = nn.Conv2d(stem_dim, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + +@BACKBONES.register_module() +class VisionTransformer(nn.Module): + """Vision transformer backbone. + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for + Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 + + Args: + img_size (tuple): input image size. Default: (224, 224). + patch_size (int, tuple): patch size. Default: 16. + in_channels (int): number of input channels. Default: 3. + embed_dim (int): embedding dimension. Default: 768. + depth (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + qk_scale (float): override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): dropout rate. Default: 0. + attn_drop_rate (float): attention dropout rate. Default: 0. + drop_path_rate (float): Rate of DropPath. Default: 0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', eps=1e-6, requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + out_reshape (str): Select the output format of feature information. + Default: NCHW. 
+ interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Default: bicubic. + with_cls_token (bool): If concatenating class token into image tokens + as transformer input. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + p_emb (str): Which Patch Embedding to use. + Default: None, using naive Patch Embedding. + stem_dim (int): hidden dim in Patch Embedding module. + Default: 64. + skip_lam (float): residual connection factor. + Default: 1.0. + use_side_layer (bool): whether use the side layer for UperNet and FCN. + Default: False (use the neck instead) + fcn (bool): switch between FCN and UperNet. + Default: False (use UperNet). + """ + + def __init__(self, + img_size=(224, 224), + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + out_indices=11, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), + act_cfg=dict(type='GELU'), + norm_eval=False, + final_norm=False, + out_shape='NCHW', + with_cls_token=True, + interpolate_mode='bicubic', + with_cp=False, + out_channels=768, + p_emb=None, + stem_dim=64, + skip_lam=1.0, + use_side_layer=False, + fcn=False): + super(VisionTransformer, self).__init__() + self.img_size = img_size + self.patch_size = patch_size + self.features = self.embed_dim = embed_dim + if p_emb=='4_2': + patch_embed_fn = partial(PatchEmbed4_2,stem_dim=stem_dim) + else: + patch_embed_fn = PatchEmbed + self.patch_embed = patch_embed_fn( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + + self.with_cls_token = with_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + self.num_patches = self.patch_embed.num_patches + if isinstance(out_indices, int): + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=dpr[i], + attn_drop=attn_drop_rate, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + skip_lam=skip_lam) for i in range(depth) + ]) + self.use_side_layer = use_side_layer + if use_side_layer: + if not fcn: + self.side_layer1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 4, stride=4, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer2 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 2, stride=2, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer3 = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer4 = nn.Sequential( + nn.Conv2d(embed_dim, out_channels, 1, 1, 0, bias=False), 
+ nn.SyncBatchNorm(out_channels), + nn.ReLU(True), + ) + else: + self.side_layer1 = nn.Identity() + self.side_layer2 = nn.Identity() + self.side_layer3 = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer4 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 2, stride=2, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, out_channels, 1, 1, 0, bias=False), + nn.SyncBatchNorm(out_channels), + nn.ReLU(True), + ) + assert out_shape in ['NLC', + 'NCHW'], 'output shape must be "NLC" or "NCHW".' + + self.out_shape = out_shape + + self.interpolate_mode = interpolate_mode + self.final_norm = final_norm + if final_norm: + _, self.norm = build_norm_layer(norm_cfg, embed_dim) + + self.norm_eval = norm_eval + self.with_cp = with_cp + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + checkpoint = _load_checkpoint(pretrained, logger=logger) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + if 'pos_embed' in state_dict.keys(): + if self.pos_embed.shape != state_dict['pos_embed'].shape: + logger.info(msg=f'Resize the pos_embed shape from \ +{state_dict["pos_embed"].shape} to {self.pos_embed.shape}') + h, w = self.img_size + pos_size = int( + math.sqrt(state_dict['pos_embed'].shape[1] - 1)) + state_dict['pos_embed'] = self.resize_pos_embed( + state_dict['pos_embed'], (h, w), (pos_size, pos_size), + self.patch_size, self.interpolate_mode) + + self.load_state_dict(state_dict, False) + + elif pretrained is None: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'mlp' in n: + normal_init(m.bias, std=1e-6) + else: + constant_init(m.bias, 0) + elif isinstance(m, Conv2d): + kaiming_init(m.weight, mode='fan_in') + if m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) + else: + raise TypeError('pretrained must be a str or None') + + @staticmethod + def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode): + """Resize pos_embed weights. + + Resize pos_embed using bicubic interpolate method. + Args: + pos_embed (torch.Tensor): pos_embed weights. + input_shpae (tuple): Tuple for (input_h, intput_w). + pos_shape (tuple): Tuple for (pos_h, pos_w). + patch_size (int): Patch size. 
+ Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C] + """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + input_h, input_w = input_shpae + pos_h, pos_w = pos_shape + cls_token_weight = pos_embed[:, 0] + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) + pos_embed_weight = F.interpolate( + pos_embed_weight, + size=[input_h // patch_size, input_w // patch_size], + align_corners=False, + mode=mode) + cls_token_weight = cls_token_weight.unsqueeze(1) + pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) + pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) + return pos_embed + + def _pos_embeding(self, x, h, w): + """Positiong embeding method. + + Resize the pos_embed, if the input image size doesn't match + the training size. + Args: + x (torch.Tensor): The pos_embed weighs, it should be + shape of [B, L2, c]. + h (int): training feature map height + w (int): training feature map width + Return: + torch.Tensor: The pos encoded image feature. + """ + B,_,C = x.size() + ct = x[:,0].unsqueeze(2) + ts = x[:,1:].transpose(1, 2).reshape(B, C, int(self.num_patches ** 0.5), int(self.num_patches ** 0.5)) + ts = F.interpolate(ts, (h, w), mode='bicubic', align_corners=False) + ts = ts.flatten(2) + x = torch.cat([ct, ts], dim=2).transpose(1, 2) + return x + def forward(self, inputs): + B = inputs.shape[0] + + x = self.patch_embed(inputs) + B, C, H, W = x.size() + x = x.flatten(2).transpose(1, 2) + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + #x = self._pos_embeding(inputs, x, self.pos_embed) + x = x + self._pos_embeding(self.pos_embed, H, W) + + if not self.with_cls_token: + # Remove class token for transformer input + x = x[:, 1:] + + outs = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if i == len(self.blocks) - 1: + if self.final_norm: + x = self.norm(x) + if i in self.out_indices: + if self.with_cls_token: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + else: + out = x + if self.out_shape == 'NCHW': + B, _, C = out.shape + out = out.reshape(B, H, W, C).permute(0, 3, 1, 2) + outs.append(out) + if self.use_side_layer: + outs[0] = self.side_layer1(outs[0]) + outs[1] = self.side_layer2(outs[1]) + outs[2] = self.side_layer3(outs[2]) + outs[3] = self.side_layer4(outs[3]) + return tuple(outs) + + def train(self, mode=True): + super(VisionTransformer, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.LayerNorm): + m.eval() + +@BACKBONES.register_module() +class ViT(VisionTransformer): + def __init__(self, **kwargs): + super(ViT, self).__init__(**kwargs) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/setup.py b/PyTorch/contrib/cv/classification/LVVIT/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..132fab40f9d587de540f6a447c8cbcb4c0bd6f1c --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/setup.py @@ -0,0 +1,59 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from setuptools import setup, find_packages +from codecs import open +from os import path + +here = path.abspath(path.dirname(__file__)) +# Get the long description from the README file +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name = 'tlt', + packages = find_packages(exclude=['seg','visualize']), + version = '0.2.0', + license='Apache License 2.0', + long_description=long_description, + long_description_content_type='text/markdown', + description = 'Token Labeling Toolbox for training image models', + author = 'Zihang Jiang', + author_email = 'jzh0103@gmail.com', + url = 'https://github.com/zihangJiang/TokenLabeling', + keywords = [ + 'imagenet', + 'attention mechanism', + 'transformer', + 'image classification', + 'token labeling' + ], + install_requires=[ + 'timm>=0.4.5', + 'torch>=1.5', + 'torchvision', + 'scipy', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + ], +) diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh b/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..7cabb607aab44bf7262886a61767b11470c03559 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh @@ -0,0 +1,75 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export 
OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d66aee3da9a528ac7699cafcc715d3869870c65 --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh
@@ -0,0 +1,15 @@
+source env_npu.sh
+if [ ! $1 ];
+then
+    DATA_DIR=/path/to/imagenet/val
+else
+    DATA_DIR="$1"
+fi
+if [ ! $2 ];
+then
+    MODEL_DIR=/path/to/checkpoint
+else
+    MODEL_DIR="$2"
+fi
+python3 validate.py $DATA_DIR --model lvvit_s --checkpoint $MODEL_DIR/lvvit_s-26m-224-83.3.pth.tar --no-test-pool --amp -b 64
+#python3 validate.py $DATA_DIR --model lvvit_s --checkpoint $MODEL_DIR/lvvit_s-26m-224-83.3.pth.tar --no-test-pool --amp --img-size 224 -b 64
diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fdf6cc4e908c9934e91068031a9371fe29e85ffe
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adjust for the model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LVVIT_for_PyTorch"
+# Training batch_size
+batch_size=256
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep the default, no need to modify
+data_path="/opt/npu/imagenet"
+
+# Training epochs
+train_epochs=300
+# Learning rate
+learning_rate=1.6e-3
+# Number of data loading workers
+workers=32
+
+# Parameter check: data_path is required; other parameters may be added or removed as the model requires, and any new parameter must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --workers* ]];then
+        workers=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path is provided; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility; test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_diename=${cur_path##*/}
+if [ x"${cur_path_last_diename}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+source ./test/env_npu.sh
+if [ -e "nohup.out" ]; then
+    rm -f nohup.out
+fi
+RANK_ID=0
+
+nohup python3 main.py $1 \
+    --device_id ${RANK_ID} \
+    --model lvvit_s \
+    -b 256 \
+    --apex-amp \
+    --img-size 224 \
+    --drop-path 0.1 \
+    --workers 32 \
+    --token-label \
+    --token-label-data $2 \
+    --token-label-size 14 \
+    --model-ema \
+    --no-prefetcher \
+    --finetune $3 &
+wait
+################## Collect training statistics ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adjust for the model
+train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review according to the model
+cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0e9b68662165a32713c7a4e165ebbd02aab6acff
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adjust for the model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi +RANK_ID=0 + +nohup python3 main.py $1 \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 300 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 32 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d071d1d27310250870882026271a46f072652355 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export RANK_SIZE=8 +KERNEL_NUM=$(($(nproc)/${RANK_SIZE})) + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi + +for((RANK_ID=0;RANK_ID<$((RANK_SIZE));RANK_ID++)); + do + export RANK_ID=$RANK_ID + PID_START=$((KERNEL_NUM*RANK_ID)) + PID_END=$((PID_START+KERNEL_NUM-1)) + taskset -c ${PID_START}-${PID_END} nohup python3 -u main.py $1 \ + --distributed \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 300 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 16 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + done + +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 
+BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffdcf9be1d7872c1c7b3a19c67568a1c7d335180 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi +RANK_ID=0 + +nohup python3 main.py $1 \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 2 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 32 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +#TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", "'"${batch_size}"'"*1000/"'"${FPS}"'"}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +#echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ef5a37a803596a259e5957fe3f230c5b2a555bdb --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh @@ -0,0 +1,142 @@ +#!/bin/bash + 
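+# Usage (inferred from the launch command further below; paths are placeholders):
+#   bash test/train_performance_8p.sh <imagenet_data_dir> <token_label_data_dir>
+# $1 is forwarded to main.py as its positional data path, $2 as --token-label-data.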
+################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export RANK_SIZE=8 +KERNEL_NUM=$(($(nproc)/${RANK_SIZE})) + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi + +for((RANK_ID=0;RANK_ID<$((RANK_SIZE));RANK_ID++)); + do + export RANK_ID=$RANK_ID + PID_START=$((KERNEL_NUM*RANK_ID)) + PID_END=$((PID_START+KERNEL_NUM-1)) + taskset -c ${PID_START}-${PID_END} nohup python3 -u main.py $1 \ + --distributed \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 2 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 16 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + done +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40a242ff224b99725711c7ad963de7eec65f19b5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aed0bc30900dbdd5f003e436cb5ae7bab4ce9f49 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .dataset import DatasetTokenLabel, create_token_label_dataset +from .loader import create_token_label_loader +from .label_transforms_factory import create_token_label_transform +from .mixup import TokenLabelMixup, FastCollateTokenLabelMixup, mixup_target as create_token_label_target +from .loader import create_loader diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5ddf6d9678eda51f6e6880917d107479b2130bbd --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py @@ -0,0 +1,143 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +""" Image dataset with label maps +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.utils.data as data + +import os +import re +import torch +import tarfile +import logging +from PIL import Image +_logger = logging.getLogger('token_label_dataset') + +IMG_EXTENSIONS = ['.png', '.jpg', '.jpeg'] + + +def natural_key(string_): + """See http://www.codinghorror.com/blog/archives/001018.html""" + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def find_images_and_targets(folder, types=IMG_EXTENSIONS, class_to_idx=None, leaf_name_only=True, sort=True): + labels = [] + filenames = [] + for root, subdirs, files in os.walk(folder, topdown=False): + rel_path = os.path.relpath(root, folder) if (root != folder) else '' + label = os.path.basename(rel_path) if leaf_name_only else rel_path.replace(os.path.sep, '_') + for f in files: + base, ext = os.path.splitext(f) + if ext.lower() in types: + filenames.append(os.path.join(root, f)) + labels.append(label) + if class_to_idx is None: + # building class index + unique_labels = set(labels) + sorted_labels = list(sorted(unique_labels, key=natural_key)) + class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} + images_and_targets = [(f, class_to_idx[l]) for f, l in zip(filenames, labels) if l in class_to_idx] + if sort: + images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k[0])) + return images_and_targets, class_to_idx + + +def load_class_map(filename, root=''): + class_map_path = filename + if not os.path.exists(class_map_path): + class_map_path = os.path.join(root, filename) + assert os.path.exists(class_map_path), 'Cannot locate specified class map file (%s)' % filename + class_map_ext = os.path.splitext(filename)[-1].lower() + if class_map_ext == '.txt': + with open(class_map_path) as f: + class_to_idx = {v.strip(): k for k, v in enumerate(f)} + else: + assert False, 'Unsupported class map extension' + return class_to_idx + + +class DatasetTokenLabel(data.Dataset): + + def __init__( + self, + root, + label_root, + load_bytes=False, + transform=None, + class_map=''): + + class_to_idx = None + if class_map: + class_to_idx = load_class_map(class_map, root) + images, class_to_idx = find_images_and_targets(root, class_to_idx=class_to_idx) + if len(images) == 0: + raise RuntimeError(f'Found 0 images in subfolders of {root}. 
' + f'Supported image extensions are {", ".join(IMG_EXTENSIONS)}') + self.root = root + self.label_root = label_root + self.samples = images + self.imgs = self.samples # torchvision ImageFolder compat + self.class_to_idx = class_to_idx + self.load_bytes = load_bytes + self.transform = transform + + def __getitem__(self, index): + path, target = self.samples[index] + score_path = os.path.join( + self.label_root, + '/'.join(path.split('/')[-2:]).split('.')[0] + '.pt') + + img = open(path, 'rb').read() if self.load_bytes else Image.open(path).convert('RGB') + score_maps = torch.load(score_path).float() + if self.transform is not None: + img, score_maps = self.transform(img, score_maps) + # append ground truth after coords + score_maps[-1,0,0,5]=target + return img, score_maps + + def __len__(self): + return len(self.samples) + + def filename(self, index, basename=False, absolute=False): + filename = self.samples[index][0] + if basename: + filename = os.path.basename(filename) + elif not absolute: + filename = os.path.relpath(filename, self.root) + return filename + + def filenames(self, basename=False, absolute=False): + fn = lambda x: x + if basename: + fn = os.path.basename + elif not absolute: + fn = lambda x: os.path.relpath(x, self.root) + return [fn(x[0]) for x in self.samples] + + +def create_token_label_dataset(dataset_type, root, label_root): + train_dir = os.path.join(root, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + if not os.path.exists(label_root): + _logger.error('Label folder does not exist at: {}'.format(label_root)) + exit(1) + return DatasetTokenLabel(train_dir, label_root) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..08a8414f7a679ef23fcd5eac536c70c9ada5ba57 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py @@ -0,0 +1,234 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+#coding=utf-8
+
+""" Transforms Factory
+
+Adapted for token labeling
+"""
+import math
+import warnings
+
+import torch
+from torchvision import transforms
+
+from .random_augment_label import rand_augment_transform
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
+from timm.data.transforms import _pil_interp, RandomResizedCropAndInterpolation, ToNumpy, ToTensor
+from timm.data.random_erasing import RandomErasing
+import random
+
+import torchvision
+from torchvision.transforms import functional as torchvision_F
+from PIL import Image
+
+_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
+
+class ComposeWithLabel(torchvision.transforms.Compose):
+    def __init__(self, **kwargs):
+        super(ComposeWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label_map):
+        # route label-aware transforms to (img, label_map); everything else only sees img
+        for t in self.transforms:
+            if type(t).__name__ == 'RandomHorizontalFlipWithLabel':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandomVerticalFlipWithLabel':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandAugment':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandomResizedCropAndInterpolationWithCoords':
+                # RandomResizedCropAndInterpolationWithCoords should run after all other transformations
+                img, label_map = t(img, label_map)
+            else:
+                img = t(img)
+        return img, label_map
+
+class RandomResizedCropAndInterpolationWithCoords(RandomResizedCropAndInterpolation):
+    def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
+                 interpolation='bilinear'):
+        if isinstance(size, tuple):
+            self.size = size
+        else:
+            self.size = (size, size)
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("range should be of kind (min, max)")
+
+        if interpolation == 'random':
+            self.interpolation = _RANDOM_INTERPOLATION
+        else:
+            self.interpolation = _pil_interp(interpolation)
+        self.scale = scale
+        self.ratio = ratio
+
+    def __call__(self, img, label_map):
+        i, j, h, w = self.get_params(img, self.scale, self.ratio)
+        coords = (i / img.size[1],
+                  j / img.size[0],
+                  h / img.size[1],
+                  w / img.size[0])
+        coords_map = torch.zeros_like(label_map[0:1])
+        # trick: store the crop coords in an extra channel appended to label_map
+        coords_map[0,0,0,0],coords_map[0,0,0,1],coords_map[0,0,0,2],coords_map[0,0,0,3] = coords
+        label_map = torch.cat([label_map, coords_map])
+        if isinstance(self.interpolation, (tuple, list)):
+            interpolation = random.choice(self.interpolation)
+        else:
+            interpolation = self.interpolation
+        return torchvision_F.resized_crop(img, i, j, h, w, self.size,
+                                          interpolation), label_map
+
+class RandomHorizontalFlipWithLabel(torchvision.transforms.RandomHorizontalFlip):
+    def __init__(self, **kwargs):
+        super(RandomHorizontalFlipWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label):
+        if torch.rand(1) < self.p:
+            return torchvision_F.hflip(img), label.flip(3)
+        return img, label
+
+class RandomVerticalFlipWithLabel(torchvision.transforms.RandomVerticalFlip):
+    def __init__(self, **kwargs):
+        super(RandomVerticalFlipWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label):
+        if torch.rand(1) < self.p:
+            return torchvision_F.vflip(img), label.flip(2)
+        return img, label
+
+
+def transforms_imagenet_train(
+        img_size=224,
+        scale=None,
+        ratio=None,
+        hflip=0.5,
+        vflip=0.,
+        color_jitter=0.4,
+        auto_augment=None,
+        interpolation='random',
+        use_prefetcher=False,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        re_prob=0.,
+        re_mode='const',
+        re_count=1,
+        re_num_splits=0,
+        separate=False,
+):
+    """
+    If separate==True,
the transforms are returned as a tuple of 3 separate transforms + for use in a mixing dataset that passes + * all data through the first (primary) transform, called the 'clean' data + * a portion of the data through the secondary transform + * normalizes and converts the branches above with the third, final transform + """ + scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range + ratio = tuple(ratio or (3./4., 4./3.)) # default imagenet ratio range + + primary_tfl=[] + if hflip > 0.: + primary_tfl += [RandomHorizontalFlipWithLabel(p=hflip)] + if vflip > 0.: + primary_tfl += [RandomVerticalFlipWithLabel(p=vflip)] + + secondary_tfl = [] + if auto_augment: + assert isinstance(auto_augment, str) + if isinstance(img_size, tuple): + img_size_min = min(img_size) + else: + img_size_min = img_size + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), + ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if auto_augment.startswith('rand'): + secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] + + elif color_jitter is not None: + # color jitter is enabled when not using AA + if isinstance(color_jitter, (list, tuple)): + # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation + # or 4 if also augmenting hue + assert len(color_jitter) in (3, 4) + else: + # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue + color_jitter = (float(color_jitter),) * 3 + secondary_tfl += [transforms.ColorJitter(*color_jitter)] + + final_tfl = [RandomResizedCropAndInterpolationWithCoords(size=img_size, scale=scale, ratio=ratio, interpolation=interpolation)] + + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + final_tfl += [ToNumpy()] + else: + final_tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ] + if re_prob > 0.: + final_tfl.append( + RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu')) + return ComposeWithLabel(transforms=primary_tfl + secondary_tfl + final_tfl) + + +def create_token_label_transform( + input_size, + is_training=False, + use_prefetcher=False, + no_aug=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + crop_pct=None, + tf_preprocessing=False, + separate=False,): + + if isinstance(input_size, tuple): + img_size = input_size[-2:] + else: + img_size = input_size + + transform = transforms_imagenet_train( + img_size, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=separate) + + return transform diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..70ff19759bf86a8342ac8291beed777f1ff151d3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py @@ -0,0 +1,393 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" Loader Factory, Fast Collate, CUDA Prefetcher +Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/loader.py and modified for token labeling +""" + +import torch.utils.data +import numpy as np + + +from .mixup import FastCollateTokenLabelMixup +from .label_transforms_factory import create_token_label_transform + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data.distributed_sampler import OrderedDistributedSampler +from timm.data.random_erasing import RandomErasing + + + +def fast_collate(batch): + """ A fast collation function optimized for uint8 images (np array or torch) and int64 targets (labels)""" + assert isinstance(batch[0], tuple) + batch_size = len(batch) + if isinstance(batch[0][0], tuple): + # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position + # such that all tuple of position n will end up in a torch.split(tensor, batch_size) in nth position + inner_tuple_size = len(batch[0][0]) + flattened_batch_size = batch_size * inner_tuple_size + targets = torch.zeros(flattened_batch_size, dtype=torch.int64) + tensor = torch.zeros((flattened_batch_size, *batch[0][0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + assert len(batch[i][0]) == inner_tuple_size # all input tensor tuples must be same length + for j in range(inner_tuple_size): + targets[i + j * batch_size] = batch[i][1] + tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j]) + return tensor, targets + elif isinstance(batch[0][0], np.ndarray): + if isinstance(batch[0][1], torch.Tensor): + targets = torch.stack([b[1] for b in batch]) + else: + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i] += torch.from_numpy(batch[i][0]) + return tensor, targets + elif isinstance(batch[0][0], torch.Tensor): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i].copy_(batch[i][0]) + return tensor, targets + else: + assert False + + +class PrefetchLoader: + + def __init__(self, + loader, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + fp16=False, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + device='cuda'): + self.loader = loader + self.mean = torch.tensor([x * 255 for x in mean]).npu().view(1, 3, 1, 1) + self.std = torch.tensor([x * 255 for x in std]).npu().view(1, 3, 1, 1) + self.fp16 = fp16 + if fp16: + self.mean = self.mean.half() + self.std = self.std.half() + if re_prob > 0.: + self.random_erasing = RandomErasing( + probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device=device) + else: + self.random_erasing = None + + def 
__iter__(self): + stream = torch.npu.Stream() + first = True + + for next_input, next_target in self.loader: + with torch.npu.stream(stream): + next_input = next_input.npu(non_blocking=True) + next_target = next_target.npu(non_blocking=True) + if self.fp16: + next_input = next_input.half().sub_(self.mean).div_(self.std) + else: + next_input = next_input.float().sub_(self.mean).div_(self.std) + if self.random_erasing is not None: + next_input = self.random_erasing(next_input) + + if not first: + yield input, target + else: + first = False + + torch.npu.current_stream().wait_stream(stream) + input = next_input + target = next_target + + yield input, target + + def __len__(self): + return len(self.loader) + + @property + def sampler(self): + return self.loader.sampler + + @property + def dataset(self): + return self.loader.dataset + + @property + def mixup_enabled(self): + if isinstance(self.loader.collate_fn, FastCollateTokenLabelMixup): + return self.loader.collate_fn.mixup_enabled + else: + return False + + @mixup_enabled.setter + def mixup_enabled(self, x): + if isinstance(self.loader.collate_fn, FastCollateTokenLabelMixup): + self.loader.collate_fn.mixup_enabled = x + + +############################## +# add argument device + +def create_loader( + device, + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False, + persistent_workers=True, +): + re_num_splits = 0 + if re_split: + # apply RE to second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + dataset.transform = create_transform( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader_args = dict( + batch_size=batch_size, + shuffle=(sampler is None), + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + persistent_workers=persistent_workers) + try: + loader = loader_class(dataset, **loader_args) + except TypeError as e: + loader_args.pop('persistent_workers') # only in Pytorch 1.7+ + loader = loader_class(dataset, **loader_args) + if use_prefetcher: + 
prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + device=device + ) + + return loader + +def create_token_label_loader( + device, + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False, + use_token_label=False, +): + re_num_splits = 0 + if re_split: + # apply RE to second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + if use_token_label: + transform_fn=create_token_label_transform + else: + transform_fn=create_transform + dataset.transform = transform_fn( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader = loader_class( + dataset, + batch_size=batch_size, + shuffle=(sampler is None), + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + ) + if use_prefetcher: + prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + device=device + ) + + return loader + + +class MultiEpochsDataLoader(torch.utils.data.DataLoader): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._DataLoader__initialized = False + self.batch_sampler = _RepeatSampler(self.batch_sampler) + self._DataLoader__initialized = True + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever. 
+ + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..fb252c0698aa107ae609c22bad8a6a4828482702 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py @@ -0,0 +1,409 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import numpy as np +import torch +from torchvision.ops import roi_align +from torch.contrib.npu.optimized_lib import module as nnn + +def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) + +def get_featuremaps(label_maps_topk, num_classes, device='cuda'): + label_maps_topk_sizes = label_maps_topk[0].size() + label_maps = torch.full([label_maps_topk.size(0), num_classes, label_maps_topk_sizes[2], + label_maps_topk_sizes[3]], 0, dtype=torch.float32 ,device=device) + for _label_map, _label_topk in zip(label_maps, label_maps_topk): + _label_map = _label_map.scatter_( + 0, + _label_topk[1][:, :, :].long(), + _label_topk[0][:, :, :].float() + ) + return label_maps + +def get_label(label_maps, batch_coords,label_size=1,device='cuda'): + ''' + Adapted from https://github.com/naver-ai/relabel_imagenet/blob/main/utils/relabel_functions.py + Here we generate label for patch tokens and cls token separately and concat them together if given label_size>1 + ''' + num_batches = label_maps.size(0) + roialign1 = nnn.ROIAlign((label_size, label_size), 1.0, 2, False) + target_label = roialign1(label_maps, torch.cat( + [torch.arange(num_batches).view(num_batches, + 1).float().to(device), + batch_coords.float() * label_maps.size(3) - 0.5], 1)) + + if label_size>1: + roialign2 = nnn.ROIAlign((1, 1), 1.0, 2, False) + target_label_cls = roialign2(label_maps, torch.cat( + [torch.arange(num_batches).view(num_batches, + 1).float().to(device), + batch_coords.float() * label_maps.size(3) - 0.5], 1)) + B,C,H,W = target_label.shape + target_label = target_label.view(B,C,H*W) + target_label = torch.cat([target_label_cls.view(B,C,1),target_label],dim=2) + target_label = torch.nn.functional.softmax(target_label.squeeze(), 1) + return target_label + +def get_labelmaps_with_coords(label_maps_topk, num_classes, on_value=1., off_value=0.,label_size=1, device='cuda'): + ''' + Adapted from https://github.com/naver-ai/relabel_imagenet/blob/main/utils/relabel_functions.py + Generate the target label map for training from the given bbox and raw label map + ''' + # trick to get coords_map from label_map + random_crop_coords = label_maps_topk[:,2,0,0,:4].view(-1, 4) + random_crop_coords[:, 2:] += random_crop_coords[:, :2] + random_crop_coords = random_crop_coords.to(device) + + # trick to get 
ground truth from label_map + ground_truth = label_maps_topk[:,2,0,0,5].view(-1).to(dtype=torch.int64) + ground_truth = one_hot(ground_truth, num_classes, on_value=on_value, off_value=off_value, device=device) + + # get full label maps from raw topk labels + label_maps = get_featuremaps(label_maps_topk=label_maps_topk, + num_classes=num_classes,device=device) + + # get token-level label and ground truth + token_label = get_label(label_maps=label_maps, + batch_coords=random_crop_coords, + label_size=label_size, + device=device) + B,C = token_label.shape[:2] + token_label = token_label*on_value+off_value + if label_size==1: + return torch.cat([ground_truth.view(B,C,1),token_label.view(B,C,1)],dim=2) + else: + return torch.cat([ground_truth.view(B,C,1),token_label],dim=2) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda', label_size=1): + ''' + generate and mix target from the given label maps + target: label maps/ label maps with coords + num_classes: number of classes for the target + lam: lambda for mixup target + ''' + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + if len(target.size())>2: + if target.size(1)==3: + y1 = get_labelmaps_with_coords(target, num_classes, on_value=on_value, off_value=off_value, device=device, label_size=label_size) + y2 = y1.flip(0) + # y2 = get_labelmaps_with_coords(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device, label_size=label_size) + else: + raise ValueError("Not supported label type") + else: + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) + + return y1 * lam + y2 * (1. - lam) + + +def rand_bbox(img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. 
+ + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): + """ Generate bbox and apply lambda correction. + """ + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class TokenLabelMixup: + """ Mixup/Cutmix with label that applies different params to each element or whole batch + Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + label_size (int): target label size + """ + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000, label_size=1): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + self.label_size=label_size + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. 
and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1. 
- lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, label_size=self.label_size) + return x, target + + +class FastCollateTokenLabelMixup(TokenLabelMixup): + """ Fast Collate w/ Mixup/Cutmix with label that applies different params to each element or whole batch + Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py + + A Mixup impl that's performed while collating the batches. + """ + + def _mix_elem_collate(self, output, batch, half=False): + batch_size = len(batch) + num_elem = batch_size // 2 if half else batch_size + assert len(output) == num_elem + lam_batch, use_cutmix = self._params_per_elem(num_elem) + for i in range(num_elem): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed = batch[i][0] + if lam != 1.: + if use_cutmix[i]: + if not half: + mixed = mixed.copy() + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + if half: + lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_pair_collate(self, output, batch): + batch_size = len(batch) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed_i = batch[i][0] + mixed_j = batch[j][0] + assert 0 <= lam <= 1.0 + if lam < 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + patch_i = mixed_i[:, yl:yh, xl:xh].copy() + mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] + mixed_j[:, yl:yh, xl:xh] = patch_i + lam_batch[i] = lam + else: + mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) + mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) + mixed_i = mixed_temp + np.rint(mixed_j, out=mixed_j) + np.rint(mixed_i, out=mixed_i) + output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) + output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_batch_collate(self, output, batch): + batch_size = len(batch) + lam, use_cutmix = self._params_per_batch() + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + for i in range(batch_size): + j = batch_size - i - 1 + mixed = batch[i][0] + if lam != 1.: + if use_cutmix: + mixed = mixed.copy() # don't want to modify the original while iterating + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + return lam + + def __call__(self, batch, _=None): + batch_size = len(batch) + assert batch_size % 2 == 0, 
'Batch size should be even when using this' + half = 'half' in self.mode + if half: + batch_size //= 2 + output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + if self.mode == 'elem' or self.mode == 'half': + lam = self._mix_elem_collate(output, batch, half=half) + elif self.mode == 'pair': + lam = self._mix_pair_collate(output, batch) + else: + lam = self._mix_batch_collate(output, batch) + + if type(batch[0][1])==type(0): + target = torch.tensor([b[1] for b in batch], dtype=torch.int64) + else: + target = torch.stack([b[1] for b in batch],0) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu',label_size=self.label_size) + target = target[:batch_size] + return output, target diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py new file mode 100644 index 0000000000000000000000000000000000000000..765cf61048f7f20ee50534d4d98d5139fff53013 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py @@ -0,0 +1,576 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" +Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py and modified for token labeling +AutoAugment, RandAugment +""" +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np +from scipy import ndimage +import torch + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + + + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. 
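+# Note: the *_level_to_arg helpers defined below rescale an integer magnitude in
+# [0, _MAX_LEVEL] to each op's argument range, e.g. a shear magnitude of 9 maps to
+# (9 / 10.) * 0.3 = 0.27 with a random sign flip, and is then applied to both the
+# image and its label map.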
+ +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, +) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + +def affine_label(label, matrix): + + # label: 2, k, H, W + # label[0] value, label[1] index + a,b,c,d,e,f = matrix + affine_matrix = [[1,0,0,0],[0,a,b,c],[0,d,e,f]] + value = ndimage.affine_transform(label[0],matrix=affine_matrix, order=0, mode="constant") + index = ndimage.affine_transform(label[1],matrix=affine_matrix, order=0, mode="nearest") + + return torch.from_numpy(np.stack([value, index],axis=0)) + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + +def shear_y_label(label, factor): + return affine_label(label, (1, factor, 0, 0, 1, 0)) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + +def shear_x_label(label, factor): + return affine_label(label, (1, 0, 0, factor, 1, 0)) + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + +def translate_y_rel_label(label, pct): + pixels = pct * label.size(2) + return affine_label(label, (1, 0, pixels, 0, 1, 0)) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + +def translate_x_rel_label(label, pct): + pixels = pct * label.size(3) + return affine_label(label, (1, 0, 0, 0, 1, pixels)) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + +def rotate_label(label, degrees): + _,_, w, h = label.size() + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + 
f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return affine_label(label, matrix) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. 
or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + +class AugmentOp: + + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.name = name + self.aug_fn = NAME_TO_OP[name] + self.label_fn = NAME_TO_LABELOP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION, + ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. 
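+ # Illustrative example (values assumed, not from the reference config):
+ # with magnitude=9 and hparams={'magnitude_std': 0.5}, each __call__ draws
+ # magnitude from N(9, 0.5) and clips it to [0, _MAX_LEVEL] before level_fn
+ # maps it to the op-specific argument tuple.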
+ self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img, label): + if self.prob < 1.0 and random.random() > self.prob: + return img, label + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple() + if self.label_fn is not None: + + aug_label = self.label_fn(label, *level_args) + else: + aug_label = label + return self.aug_fn(img, *level_args, **self.kwargs), aug_label + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} +# Remove TranslateX and TranslateY here since it is actually not used in random aug +# Only spatial op should be applied to the label map +NAME_TO_LABELOP = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': rotate_label, + 'Posterize': None, + 'PosterizeIncreasing': None, + 'PosterizeOriginal': None, + 'Solarize': None, + 'SolarizeIncreasing': None, + 'SolarizeAdd': None, + 'Color': None, + 'ColorIncreasing': None, + 'Contrast': None, + 'ContrastIncreasing': None, + 'Brightness': None, + 'BrightnessIncreasing': None, + 'Sharpness': None, + 'SharpnessIncreasing': None, + 'ShearX': shear_x_label, + 'ShearY': shear_y_label, + 'TranslateX': None, + 'TranslateY': None, + 'TranslateXRel': translate_x_rel_label, + 'TranslateYRel': translate_y_rel_label, +} + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 
'TranslateYRel', + #'Cutout' +] + + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' +] + + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class RandAugment: + ''' + Apply RandAug on both image and dense label map + ''' + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img, label): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights) + for op in ops: + img, label = op(img, label) + return img, label + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform with label + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A PyTorch compatible Transform + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1544c017967c2101d374786271ccd15ae542b02b --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .cross_entropy import TokenLabelCrossEntropy,TokenLabelSoftTargetCrossEntropy \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..5615b9a0cb07d29306b933436d463894fc98c6c0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py @@ -0,0 +1,99 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +class SoftTargetCrossEntropy(nn.Module): + + def __init__(self): + super(SoftTargetCrossEntropy, self).__init__() + + def forward(self, x, target): + N_rep = x.shape[0] + N = target.shape[0] + if not N==N_rep: + target = target.repeat(N_rep//N,1) + loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1) + return loss.mean() + +class TokenLabelSoftTargetCrossEntropy(nn.Module): + + def __init__(self): + super(TokenLabelSoftTargetCrossEntropy, self).__init__() + + def forward(self, x, target): + N_rep = x.shape[0] + N = target.shape[0] + if not N==N_rep: + target = target.repeat(N_rep//N,1) + if len(target.shape)==3 and target.shape[-1]==2: + ground_truth=target[:,:,0] + target = target[:,:,1] + loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1) + return loss.mean() + +class TokenLabelCrossEntropy(nn.Module): + """ + Token labeling loss. + """ + def __init__(self, dense_weight=1.0, cls_weight = 1.0, mixup_active=True, classes = 1000, ground_truth = False): + """ + Constructor Token labeling loss. + """ + super(TokenLabelCrossEntropy, self).__init__() + + + self.CE = SoftTargetCrossEntropy() + + self.dense_weight = dense_weight + self.mixup_active = mixup_active + self.classes = classes + self.cls_weight = cls_weight + self.ground_truth = ground_truth + assert dense_weight+cls_weight>0 + + + def forward(self, x, target): + + output, aux_output, bb = x + bbx1, bby1, bbx2, bby2 = bb + + B,N,C = aux_output.shape + if len(target.shape)==2: + target_cls=target + target_aux = target.repeat(1,N).reshape(B*N,C) + else: + target_cls = target[:,:,1] + if self.ground_truth: + # use ground truth to help correct label. + # rely more on ground truth if target_cls is incorrect. + ground_truth = target[:,:,0] + ratio = (0.9 - 0.4 * (ground_truth.max(-1)[1] == target_cls.max(-1)[1])).unsqueeze(-1) + target_cls = target_cls * ratio + ground_truth * (1 - ratio) + target_aux = target[:,:,2:] + target_aux = target_aux.transpose(1,2).reshape(-1,C) + lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / N) + if lam<1: + target_cls = lam*target_cls + (1-lam)*target_cls.flip(0) + + aux_output = aux_output.reshape(-1,C) + loss_cls = self.CE(output, target_cls) + loss_aux = self.CE(aux_output, target_aux) + return self.cls_weight*loss_cls+self.dense_weight* loss_aux + diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..61b4a019c51f7dc97e0728b4ee2519a393e275f5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .lvvit import * diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..42221bac677ed1e6c2ce7fe16e003d51bdfca536 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py @@ -0,0 +1,437 @@ + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn +import numpy as np +from functools import partial +import torch.nn.init as init +import torch.nn.functional as F +import math +from timm.models.layers import DropPath, to_2tuple + +DROPOUT_FLOPS = 4 +LAYER_NORM_FLOPS = 5 +ACTIVATION_FLOPS = 8 +SOFTMAX_FLOPS = 5 + +class GroupLinear(nn.Module): + ''' + Group Linear operator + ''' + def __init__(self, in_planes, out_channels,groups=1, bias=True): + super(GroupLinear, self).__init__() + assert in_planes%groups==0 + assert out_channels%groups==0 + self.in_dim = in_planes + self.out_dim = out_channels + self.groups=groups + self.bias = bias + self.group_in_dim = int(self.in_dim/self.groups) + self.group_out_dim = int(self.out_dim/self.groups) + + self.group_weight = nn.Parameter(torch.zeros(self.groups, self.group_in_dim, self.group_out_dim)) + self.group_bias=nn.Parameter(torch.zeros(self.out_dim)) + + def forward(self, x): + t,b,d=x.size() + x = x.view(t,b,self.groups,int(d/self.groups)) + out = torch.einsum('tbgd,gdf->tbgf', (x, self.group_weight)).reshape(t,b,self.out_dim)+self.group_bias + return out + def extra_repr(self): + s = ('{in_dim}, {out_dim}') + if self.groups != 1: + s += ', groups={groups}' + if self.bias is None: + s += ', bias=False' + return s.format(**self.__dict__) + + +class Mlp(nn.Module): + ''' + MLP with support to use group linear operator + ''' + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., group=1): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + if group==1: + self.fc1 = nn.Linear(in_features, hidden_features) + self.fc2 = nn.Linear(hidden_features, out_features) + else: + self.fc1 = GroupLinear(in_features, hidden_features,group) + self.fc2 = GroupLinear(hidden_features, out_features,group) + self.act = act_layer() + + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class GroupNorm(nn.Module): + def __init__(self, num_groups, embed_dim, 
eps=1e-5, affine=True): + super().__init__() + self.gn = nn.GroupNorm(num_groups, embed_dim,eps,affine) + + def forward(self, x): + B,T,C = x.shape + x = x.view(B*T,C) + x = self.gn(x) + x = x.view(B,T,C) + return x + + +class Attention(nn.Module): + ''' + Multi-head self-attention + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + with some modification to support different num_heads and head_dim. + ''' + def __init__(self, dim, num_heads=8, head_dim=None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + if head_dim is not None: + self.head_dim=head_dim + else: + head_dim = dim // num_heads + self.head_dim = head_dim + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, self.head_dim* self.num_heads * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(self.head_dim* self.num_heads, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, padding_mask=None): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + # B,heads,N,C/heads + q, k, v = qkv[0], qkv[1], qkv[2] + + # trick here to make q@k.t more stable + attn = ((q * self.scale) @ k.transpose(-2, -1)) + if padding_mask is not None: + attn = attn.view(B, self.num_heads, N, N) + attn = attn.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + attn_float = attn.softmax(dim=-1, dtype=torch.float32) + attn = attn_float.type_as(attn) + else: + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.head_dim* self.num_heads) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + ''' + Pre-layernorm transformer block + ''' + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.dim = dim + self.mlp_hidden_dim = int(dim * mlp_ratio) + self.skip_lam = skip_lam + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=self.mlp_hidden_dim, act_layer=act_layer, drop=drop, group=group) + + def forward(self, x, padding_mask=None): + x = x + self.drop_path(self.attn(self.norm1(x),padding_mask))/self.skip_lam + x = x + self.drop_path(self.mlp(self.norm2(x)))/self.skip_lam + return x + + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + i = self.mlp_hidden_dim + mha_block_flops = dict( + kqv=3 * h * h , + attention_scores=h * s, + attn_softmax=SOFTMAX_FLOPS * s * heads, + attention_dropout=DROPOUT_FLOPS * s * heads, + attention_scale=s * heads, + attention_weighted_avg_values=h * s, + attn_output=h * h, + attn_output_bias=h, + attn_output_dropout=DROPOUT_FLOPS * h, + attn_output_residual=h, + attn_output_layer_norm=LAYER_NORM_FLOPS * h,) + ffn_block_flops = dict( + intermediate=h * i, + intermediate_act=ACTIVATION_FLOPS * i, + intermediate_bias=i, + output=h * i, + output_bias=h, + output_dropout=DROPOUT_FLOPS * h, + output_residual=h, + output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(mha_block_flops.values())*s + sum(ffn_block_flops.values())*s + +class MHABlock(nn.Module): + """ + Multihead Attention block with residual branch + """ + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.dim = dim + self.norm1 = norm_layer(dim) + self.skip_lam = skip_lam + self.attn = Attention( + dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x, padding_mask=None): + x = x + self.drop_path(self.attn(self.norm1(x*self.skip_lam), padding_mask))/self.skip_lam + return x + + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + block_flops = dict( + kqv=3 * h * h , + attention_scores=h * s, + attn_softmax=SOFTMAX_FLOPS * s * heads, + attention_dropout=DROPOUT_FLOPS * s * heads, + attention_scale=s * heads, + attention_weighted_avg_values=h * s, + attn_output=h * h, + attn_output_bias=h, + attn_output_dropout=DROPOUT_FLOPS * h, + attn_output_residual=h, + attn_output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(block_flops.values())*s + +class FFNBlock(nn.Module): + """ + Feed forward network with residual branch + """ + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.skip_lam = skip_lam + self.dim = dim + self.mlp_hidden_dim = int(dim * mlp_ratio) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=self.mlp_hidden_dim, act_layer=act_layer, drop=drop, group=group) + def forward(self, x): + x = x + self.drop_path(self.mlp(self.norm2(x*self.skip_lam)))/self.skip_lam + return x + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + i = self.mlp_hidden_dim + block_flops = dict( + intermediate=h * i, + intermediate_act=ACTIVATION_FLOPS * i, + intermediate_bias=i, + output=h * i, + output_bias=h, + output_dropout=DROPOUT_FLOPS * h, + output_residual=h, + output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(block_flops.values())*s + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Conv2d(feature_dim, embed_dim,kernel_size=1) + + def forward(self, x): + x = self.backbone(x)[-1] + x = self.proj(x) + return x + + +class PatchEmbedNaive(nn.Module): + """ + Image to Patch Embedding + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
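+ # proj is a patch_size x patch_size conv with matching stride, so each
+ # non-overlapping patch is projected to embed_dim channels:
+ # [B, in_chans, H, W] -> [B, embed_dim, H // patch_size, W // patch_size].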
+ x = self.proj(x) + return x + + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + proj=img_size*img_size*3*self.embed_dim, + ) + return sum(block_flops.values()) + + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(64) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(64) + + self.proj = nn.Conv2d(64, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + conv1=img_size/2*img_size/2*3*64*7*7, + conv2=img_size/2*img_size/2*64*64*3*3, + conv3=img_size/2*img_size/2*64*64*3*3, + proj=img_size/2*img_size/2*64*self.embed_dim, + ) + return sum(block_flops.values()) + + +class PatchEmbed4_2_128(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution and 128 filters + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_chans, 128, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(128) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(128) + self.conv3 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(128) + + self.proj = nn.Conv2d(128, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + conv1=img_size/2*img_size/2*3*128*7*7, + conv2=img_size/2*img_size/2*128*128*3*3, + conv3=img_size/2*img_size/2*128*128*3*3, + proj=img_size/2*img_size/2*128*self.embed_dim, + ) + return sum(block_flops.values()) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py new file mode 100644 index 
0000000000000000000000000000000000000000..9be9c2fb1dcd8635c061bbb4181775f5473ff84e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py @@ -0,0 +1,298 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_ +from timm.models.resnet import resnet26d, resnet50d, resnet101d +import numpy as np + +from .layers import * + + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), + 'classifier': 'head', + **kwargs + } + +default_cfgs = { + 'LV_ViT_Tiny': _cfg(), + 'LV_ViT': _cfg(), + 'LV_ViT_Medium': _cfg(crop_pct=1.0), + 'LV_ViT_Large': _cfg(crop_pct=1.0), +} + +def get_block(block_type, **kargs): + if block_type=='mha': + # multi-head attention block + return MHABlock(**kargs) + elif block_type=='ffn': + # feed forward block + return FFNBlock(**kargs) + elif block_type=='tr': + # transformer block + return Block(**kargs) + + +def rand_bbox(size, beta=1.0): + W = size[2] + H = size[3] + while True: + lam = np.random.beta(beta, beta) + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(W * cut_rat) + cut_h = np.int(H * cut_rat) + + # uniform + cx = np.random.randint(W) + cy = np.random.randint(H) + + bbx1 = np.clip(cx - cut_w // 2, 0, W) + bby1 = np.clip(cy - cut_h // 2, 0, H) + bbx2 = np.clip(cx + cut_w // 2, 0, W) + bby2 = np.clip(cy + cut_h // 2, 0, H) + + if bbx1 != bbx2 and bby1 != bby2: + break + + return bbx1, bby1, bbx2, bby2 + + +def get_dpr(drop_path_rate,depth,drop_path_decay='linear'): + if drop_path_decay=='linear': + # linear dpr decay + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + elif drop_path_decay=='fix': + # use fixed dpr + dpr= [drop_path_rate]*depth + else: + # use predefined drop_path_rate list + assert len(drop_path_rate)==depth + dpr=drop_path_rate + return dpr + + +class LV_ViT(nn.Module): + """ Vision Transformer with tricks + Arguements: + p_emb: different conv based position embedding (default: 4 layer conv) + skip_lam: residual scalar for skip connection (default: 1.0) + order: which order of layers will be used (default: None, will override depth if given) + mix_token: use mix token augmentation for batch of tokens (default: False) + return_dense: whether to return feature of all tokens with an additional aux_head (default: False) + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., drop_path_decay='linear', hybrid_backbone=None, norm_layer=nn.LayerNorm, p_emb='4_2', head_dim = None, + skip_lam = 1.0,order=None, mix_token=False, return_dense=False): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.output_dim = embed_dim if num_classes==0 else num_classes + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + if p_emb=='4_2': + patch_embed_fn = PatchEmbed4_2 + elif p_emb=='4_2_128': + patch_embed_fn = PatchEmbed4_2_128 + else: + patch_embed_fn = PatchEmbedNaive + + self.patch_embed = patch_embed_fn(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + if order is None: + dpr=get_dpr(drop_path_rate, depth, drop_path_decay) + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, head_dim=head_dim, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, skip_lam=skip_lam) + for i in range(depth)]) + else: + # use given order to sequentially generate modules + dpr=get_dpr(drop_path_rate, len(order), drop_path_decay) + self.blocks = nn.ModuleList([ + get_block(order[i], + dim=embed_dim, num_heads=num_heads, head_dim=head_dim, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, skip_lam=skip_lam) + for i in range(len(order))]) + + self.norm = norm_layer(embed_dim) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + self.return_dense=return_dense + self.mix_token=mix_token + + if return_dense: + 
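+ # The auxiliary head maps every patch token to num_classes and is consumed
+ # by the token-labeling loss together with the main cls-token head.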
self.aux_head=nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + if mix_token: + self.beta = 1.0 + assert return_dense, "always return all features when mixtoken is enabled" + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, GroupLinear): + trunc_normal_(m.group_weight, std=.02) + if isinstance(m, GroupLinear) and m.group_bias is not None: + nn.init.constant_(m.group_bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_embeddings(self,x): + x = self.patch_embed(x) + return x + def forward_tokens(self, x): + B = x.shape[0] + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x + + def forward_features(self,x): + # simple forward to obtain feature map (without mixtoken) + x = self.forward_embeddings(x) + x = x.flatten(2).transpose(1, 2) + x = self.forward_tokens(x) + return x + + def forward(self, x): + x = self.forward_embeddings(x) + + # token level mixtoken augmentation + if self.mix_token and self.training: + patch_h, patch_w = x.shape[2],x.shape[3] + bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), self.beta) + temp_x = x.clone() + temp_x[:, :, bbx1:bbx2, bby1:bby2] = x.flip(0)[:, :, bbx1:bbx2, bby1:bby2] + x = temp_x + else: + bbx1, bby1, bbx2, bby2 = 0,0,0,0 + + x = x.flatten(2).transpose(1, 2) + x = self.forward_tokens(x) + x_cls = self.head(x[:,0]) + + + if self.return_dense: + # import pdb + # pdb.set_trace() + x_aux = self.aux_head(x[:,1:]) + if not self.training: + return x_cls+0.5*x_aux.max(1)[0] + + # recover the mixed part + if self.mix_token and self.training: + x_aux = x_aux.reshape(x_aux.shape[0],patch_h, patch_w,x_aux.shape[-1]) + temp_x = x_aux.clone() + # print("===================python print===================") + # print("x_aux shape after clone", x_aux.shape) + # print("x_aux stride after clone", x_aux.stride()) + # print("x_aux format after clone", x_aux.storage().npu_format()) + # print("temp_x shape after clone", temp_x.shape) + # print("temp_x stride after clone", temp_x.stride()) + # print("temp_x format after clone", temp_x.storage().npu_format()) + # print("bbx1, bbx2, bby1, bby2: ", bbx1, bbx2, bby1, bby2) + # print("x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :].shape: ", x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :].shape) + # print("temp_x[:, bbx1:bbx2, bby1:bby2, :].shape: ", temp_x[:, bbx1:bbx2, bby1:bby2, :].shape) + # print("===================python print end===================") + temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :] + x_aux = temp_x + x_aux = x_aux.reshape(x_aux.shape[0],patch_h*patch_w,x_aux.shape[-1]) + # print("===================python print===================") + # print("x_aux shape after reshape", x_aux.shape) + # print("x_aux stride after reshape", x_aux.stride()) + # print("x_aux format 
after reshape", x_aux.storage().npu_format()) + # print("===================python print end===================") + + return x_cls, x_aux, (bbx1, bby1, bbx2, bby2) + return x_cls + +@register_model +def vit(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb=1, **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + + +@register_model +def lvvit(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb='4_2',skip_lam=2., **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + +@register_model +def lvvit_s(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb='4_2',skip_lam=2., return_dense=True,mix_token=True, **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + +@register_model +def lvvit_m(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=512, depth=20, num_heads=8, mlp_ratio=3., + p_emb='4_2',skip_lam=2., return_dense=True,mix_token=True, **kwargs) + model.default_cfg = default_cfgs['LV_ViT_Medium'] + return model + + +@register_model +def lvvit_l(pretrained=False, **kwargs): + order = ['tr']*24 # this will override depth, can also be set as None + model = LV_ViT(patch_size=16, embed_dim=768,depth=24, num_heads=12, mlp_ratio=3., + p_emb='4_2_128',skip_lam=3., return_dense=True,mix_token=True, order=order, **kwargs) + model.default_cfg = default_cfgs['LV_ViT_Large'] + return model diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c755f330d14c4dd47508b3acf377efa0cfbb65 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .utils import load_pretrained_weights \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3aa6e03ace681290ba9335e6e6a662a0ab287e21 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py @@ -0,0 +1,158 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +# Adapted for token labeling + +''' +- resize_pos_embed: resize position embedding +- load_for_transfer_learning: load pretrained paramters to model in transfer learning +- get_mean_and_std: calculate the mean and std value of dataset. +''' + +import os +import sys +import time +import torch +import math + +import torch.nn as nn +import torch.nn.init as init +import logging +import os +from collections import OrderedDict +import torch.nn.functional as F + +_logger = logging.getLogger(__name__) + +def resize_pos_embed(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + + posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] # posemb_tok is for cls token, posemb_grid for the following tokens + ntok_new -= 1 + gs_old = int(math.sqrt(len(posemb_grid))) # 14 + gs_new = int(math.sqrt(ntok_new)) # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) # [1, 196, dim]->[1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) # [1, dim, 24, 24] -> [1, 24*24, dim] + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) # [1, 24*24+1, dim] + return posemb + +def resize_pos_embed_without_cls(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + posemb_grid = posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) # 14 + gs_new = int(math.sqrt(ntok_new)) # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) # [1, 196, dim]->[1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) # [1, dim, 24, 24] -> [1, 24*24, dim] + return posemb_grid + + +def resize_pos_embed_4d(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + gs_old = posemb.shape[1] # 14 + gs_new = posemb_new.shape[1] # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb + posemb_grid = posemb_grid.permute(0, 3, 1, 2) # [1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1) # [1, dim, 24, 24]->[1, 24, 24, dim] + return posemb_grid + + +def load_state_dict(checkpoint_path,model, use_ema=False, num_classes=1000): + if checkpoint_path and os.path.isfile(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + state_dict_key = 'state_dict' + if isinstance(checkpoint, dict): + if use_ema and 'state_dict_ema' in checkpoint: + state_dict_key = 'state_dict_ema' + if state_dict_key and state_dict_key in checkpoint: + new_state_dict = OrderedDict() + for k, v in checkpoint[state_dict_key].items(): + # strip `module.` prefix + name = k[7:] if k.startswith('module') else k + new_state_dict[name] = v + state_dict = new_state_dict + else: + state_dict = checkpoint + _logger.info("Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path)) + if num_classes != state_dict['head.bias'].shape[0]: + # completely discard fully connected for all other differences between pretrained and created model + del state_dict['head.weight'] + del state_dict['head.bias'] + old_aux_head_weight = state_dict.pop('aux_head.weight', None) + old_aux_head_bias = state_dict.pop('aux_head.bias', None) + + + old_posemb = state_dict['pos_embed'] + if model.pos_embed.shape != old_posemb.shape: # need resize the position embedding by interpolate + if len(old_posemb.shape)==3: + if int(math.sqrt(old_posemb.shape[1]))**2==old_posemb.shape[1]: + new_posemb = resize_pos_embed_without_cls(old_posemb, model.pos_embed) + else: + new_posemb = resize_pos_embed(old_posemb, model.pos_embed) + elif len(old_posemb.shape)==4: + new_posemb = resize_pos_embed_4d(old_posemb, model.pos_embed) + state_dict['pos_embed'] = new_posemb + + return state_dict + else: + _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) + raise FileNotFoundError() + + +def load_pretrained_weights(model, checkpoint_path, use_ema=False, strict=True, num_classes=1000): + state_dict = load_state_dict(checkpoint_path, model, use_ema, num_classes) + model.load_state_dict(state_dict, strict=strict) + + +def get_mean_and_std(dataset): + '''Compute the mean and std value of dataset.''' + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) + mean = torch.zeros(3) + std = torch.zeros(3) + print('==> Computing mean and std..') + for inputs, targets in dataloader: + for i in range(3): + mean[i] += inputs[:,i,:,:].mean() + std[i] += inputs[:,i,:,:].std() + mean.div_(len(dataset)) + std.div_(len(dataset)) + return mean, std + +def init_params(net): + '''Init layer parameters.''' + for m in net.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal(m.weight, mode='fan_out') + if m.bias: + init.constant(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + init.constant(m.weight, 1) + init.constant(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal(m.weight, std=1e-3) + if m.bias: + init.constant(m.bias, 0) + diff --git a/PyTorch/contrib/cv/classification/LVVIT/validate.py 
b/PyTorch/contrib/cv/classification/LVVIT/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..3fc51aaef37d511919810e468a554b8b5c794b2e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/validate.py @@ -0,0 +1,389 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +#!/usr/bin/env python3 +""" ImageNet Validation Script +Adapted from https://github.com/rwightman/pytorch-image-models +The script is further extend to evaluate LV-ViT models + +""" +import argparse +import os +import csv +import glob +import time +import logging +import torch +import torch.nn as nn +import torch.nn.parallel +from collections import OrderedDict +from contextlib import suppress + +from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models +from timm.models.helpers import load_state_dict +from timm.data import create_dataset, resolve_data_config, RealLabelsImagenet +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy +from tlt.data import create_loader +import tlt.models + +has_apex = False +try: + from apex import amp + has_apex = True +except ImportError: + pass + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('validate') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--split', metavar='NAME', default='validation', + help='dataset split (default: validation)') +parser.add_argument('--model', '-m', metavar='NAME', default='dpn92', + help='model architecture (default: dpn92)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 2)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--img-size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop pct')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+                    help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+                    help='Image resize interpolation type (overrides model)')
+parser.add_argument('--num-classes', type=int, default=None,
+                    help='Number of classes in dataset')
+parser.add_argument('--class-map', default='', type=str, metavar='FILENAME',
+                    help='path to class to idx mapping file (default: "")')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--log-freq', default=50, type=int,
+                    metavar='N', help='batch logging frequency (default: 50)')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--num-gpu', type=int, default=1,
+                    help='Number of GPUs to use')
+parser.add_argument('--no-test-pool', dest='no_test_pool', action='store_true',
+                    help='disable test time pool')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+                    help='disable fast prefetcher')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+parser.add_argument('--amp', action='store_true', default=False,
+                    help='Use AMP mixed precision.
Defaults to Apex, fallback to native Torch AMP.') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--tf-preprocessing', action='store_true', default=False, + help='Use Tensorflow preprocessing pipeline (require CPU TF installed') +parser.add_argument('--use-ema', dest='use_ema', action='store_true', + help='use ema version of weights if present') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--legacy-jit', dest='legacy_jit', action='store_true', + help='use legacy jit mode for pytorch 1.5/1.5.1/1.6 to get back fusion performance') +parser.add_argument('--results-file', default='', type=str, metavar='FILENAME', + help='Output csv file for validation results (summary)') +parser.add_argument('--real-labels', default='', type=str, metavar='FILENAME', + help='Real labels JSON file for imagenet evaluation') +parser.add_argument('--valid-labels', default='', type=str, metavar='FILENAME', + help='Valid label indices txt file for validation of partial label space') + + +def validate(args): + # might as well try to validate something + args.pretrained = args.pretrained or not args.checkpoint + args.prefetcher = not args.no_prefetcher + amp_autocast = suppress # do nothing + if args.amp: + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + else: + _logger.warning("Neither APEX or Native Torch AMP is available.") + assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set." + if args.native_amp: + amp_autocast = torch.cuda.amp.autocast + _logger.info('Validating in mixed precision with native PyTorch AMP.') + elif args.apex_amp: + _logger.info('Validating in mixed precision with NVIDIA APEX AMP.') + else: + _logger.info('Validating in float32. AMP not enabled.') + + if args.legacy_jit: + set_jit_legacy() + device = torch.device(f"npu:0") + + # create model + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + in_chans=3, + global_pool=args.gp, + scriptable=args.torchscript, + img_size=args.img_size) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
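+ # Fall back to the model's own class count when --num-classes is not supplied.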
+        args.num_classes = model.num_classes
+
+    if args.checkpoint:
+        load_checkpoint(model, args.checkpoint, args.use_ema, strict=False)
+
+    param_count = sum([m.numel() for m in model.parameters()])
+    _logger.info('Model %s created, param count: %d' % (args.model, param_count))
+
+    data_config = resolve_data_config(vars(args), model=model, use_test_size=True)
+    test_time_pool = False
+    if not args.no_test_pool:
+        model, test_time_pool = apply_test_time_pool(model, data_config, use_test_size=True)
+
+    if args.torchscript:
+        torch.jit.optimized_execution(True)
+        model = torch.jit.script(model)
+
+    # the script runs on a single Ascend NPU, so the model, loss and host-side
+    # tensors are all moved to the npu device rather than cuda
+    model = model.npu()
+    if args.apex_amp:
+        model = amp.initialize(model, opt_level='O1')
+
+    if args.channels_last:
+        model = model.to(memory_format=torch.channels_last)
+
+    if args.num_gpu > 1:
+        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu)))
+
+    criterion = nn.CrossEntropyLoss().npu()
+
+    # dataset = create_dataset(
+    #     root=args.data, name=args.dataset, split=args.split,
+    #     load_bytes=args.tf_preprocessing, class_map=args.class_map)
+
+    dataset = create_dataset(
+        name=args.dataset, root=args.data, split=args.split, is_training=False, batch_size=args.batch_size)
+
+    if args.valid_labels:
+        with open(args.valid_labels, 'r') as f:
+            valid_labels = {int(line.rstrip()) for line in f}
+            valid_labels = [i in valid_labels for i in range(args.num_classes)]
+    else:
+        valid_labels = None
+
+    if args.real_labels:
+        real_labels = RealLabelsImagenet(dataset.filenames(basename=True), real_json=args.real_labels)
+    else:
+        real_labels = None
+
+    crop_pct = 1.0 if test_time_pool else data_config['crop_pct']
+
+    '''
+    loader = create_loader(
+        device,
+        dataset,
+        #dataset_train = dataset,
+        input_size=data_config['input_size'],
+        batch_size=args.batch_size,
+        use_prefetcher=args.prefetcher,
+        interpolation=data_config['interpolation'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        crop_pct=crop_pct,
+        pin_memory=args.pin_mem,
+        tf_preprocessing=args.tf_preprocessing)
+    '''
+    loader = create_loader(
+        device,
+        dataset,
+        input_size=data_config['input_size'],
+        batch_size=args.batch_size,
+        is_training=False,
+        use_prefetcher=args.prefetcher,
+        interpolation=data_config['interpolation'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        #distributed=args.distributed,
+        crop_pct=data_config['crop_pct'],
+        pin_memory=args.pin_mem,
+    )
+
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+
+    model.eval()
+    with torch.no_grad():
+        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
+        input = torch.randn((args.batch_size,) + data_config['input_size']).npu()
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
+        model(input)
+        end = time.time()
+        for batch_idx, (input, target) in enumerate(loader):
+            if args.no_prefetcher:
+                target = target.npu()
+                input = input.npu()
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
+
+            # compute output
+            with amp_autocast():
+                output = model(input)
+            if isinstance(output, (tuple, list)):
+                output = output[0]
+            if valid_labels is not None:
+                output = output[:, valid_labels]
+            loss = criterion(output, target)
+
+            if real_labels is not None:
+                real_labels.add_result(output)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
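+            # AverageMeter weights each update by the batch size, so the reported
+            # averages are per-sample rather than per-batch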
+            losses.update(loss.item(), input.size(0))
+            top1.update(acc1.item(), input.size(0))
+            top5.update(acc5.item(), input.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if batch_idx % args.log_freq == 0:
+                _logger.info(
+                    'Test: [{0:>4d}/{1}] '
+                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+                    'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
+                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
+                        batch_idx, len(loader), batch_time=batch_time,
+                        rate_avg=input.size(0) / batch_time.avg,
+                        loss=losses, top1=top1, top5=top5))
+
+    if real_labels is not None:
+        # real labels mode replaces topk values at the end
+        top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5)
+    else:
+        top1a, top5a = top1.avg, top5.avg
+    results = OrderedDict(
+        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
+        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
+        param_count=round(param_count / 1e6, 2),
+        img_size=data_config['input_size'][-1],
+        crop_pct=crop_pct,
+        interpolation=data_config['interpolation'])
+
+    _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
+        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+
+    return results
+
+
+def main():
+    setup_default_logging()
+    args = parser.parse_args()
+    model_cfgs = []
+    model_names = []
+    if os.path.isdir(args.checkpoint):
+        # validate all checkpoints in a path with same model
+        checkpoints = glob.glob(args.checkpoint + '/*.pth.tar')
+        checkpoints += glob.glob(args.checkpoint + '/*.pth')
+        model_names = list_models(args.model)
+        model_cfgs = [(args.model, c) for c in sorted(checkpoints, key=natural_key)]
+    else:
+        if args.model == 'all':
+            # validate all models in a list of names with pretrained checkpoints
+            args.pretrained = True
+            model_names = list_models(pretrained=True, exclude_filters=['*in21k'])
+            model_cfgs = [(n, '') for n in model_names]
+        elif not is_model(args.model):
+            # model name doesn't exist, try as wildcard filter
+            model_names = list_models(args.model)
+            model_cfgs = [(n, '') for n in model_names]
+
+    if len(model_cfgs):
+        results_file = args.results_file or './results-all.csv'
+        _logger.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names)))
+        results = []
+        try:
+            start_batch_size = args.batch_size
+            for m, c in model_cfgs:
+                batch_size = start_batch_size
+                args.model = m
+                args.checkpoint = c
+                result = OrderedDict(model=args.model)
+                r = {}
+                while not r and batch_size >= args.num_gpu:
+                    torch.cuda.empty_cache()
+                    try:
+                        args.batch_size = batch_size
+                        print('Validating with batch size: %d' % args.batch_size)
+                        r = validate(args)
+                    except RuntimeError as e:
+                        if batch_size <= args.num_gpu:
+                            print("Validation failed with no ability to reduce batch size. 
Exiting.") + raise e + batch_size = max(batch_size // 2, args.num_gpu) + print("Validation failed, reducing batch size by 50%") + result.update(r) + if args.checkpoint: + result['checkpoint'] = args.checkpoint + results.append(result) + except KeyboardInterrupt as e: + pass + results = sorted(results, key=lambda x: x['top1'], reverse=True) + if len(results): + write_results(results_file, results) + else: + validate(args) + + +def write_results(results_file, results): + with open(results_file, mode='w') as cf: + dw = csv.DictWriter(cf, fieldnames=results[0].keys()) + dw.writeheader() + for r in results: + dw.writerow(r) + cf.flush() + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py b/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py new file mode 100644 index 0000000000000000000000000000000000000000..e13780bed2316b7302480387aa1631e5b557a546 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py @@ -0,0 +1,505 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" Vision Transformer (ViT) in PyTorch +adapted from https://github.com/hila-chefer/Transformer-Explainability/blob/main/baselines/ViT/ViT_LRP.py +""" +import torch +import torch.nn as nn +from einops import rearrange +from modules.layers_ours import * + +from baselines.ViT.helpers import load_pretrained +from baselines.ViT.weight_init import trunc_normal_ +from baselines.ViT.layer_helpers import to_2tuple + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # patch models + 'lvvit_base_patch16_224': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_m-56M-224-84.0.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ), + 'lvvit_small_patch16_224': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_s-26M-224-83.3.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ), + 'lvvit_small_patch16_384': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_s-26M-384-84.4.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ) +} + +def compute_rollout_attention(all_layer_matrices, start_layer=0): + # adding residual consideration + num_tokens = all_layer_matrices[0].shape[1] + batch_size = all_layer_matrices[0].shape[0] + eye = torch.eye(num_tokens).expand(batch_size, num_tokens, num_tokens).to(all_layer_matrices[0].device) + all_layer_matrices = [all_layer_matrices[i] + eye for i in range(len(all_layer_matrices))] + # all_layer_matrices = [all_layer_matrices[i] / all_layer_matrices[i].sum(dim=-1, keepdim=True) + # for i in range(len(all_layer_matrices))] + 
joint_attention = all_layer_matrices[start_layer] + for i in range(start_layer+1, len(all_layer_matrices)): + joint_attention = all_layer_matrices[i].bmm(joint_attention) + return joint_attention + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = GELU() + self.fc2 = Linear(hidden_features, out_features) + self.drop = Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + def relprop(self, cam, **kwargs): + cam = self.drop.relprop(cam, **kwargs) + cam = self.fc2.relprop(cam, **kwargs) + cam = self.act.relprop(cam, **kwargs) + cam = self.fc1.relprop(cam, **kwargs) + return cam + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False,attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = head_dim ** -0.5 + + # A = Q*K^T + self.matmul1 = einsum('bhid,bhjd->bhij') + # attn = A*V + self.matmul2 = einsum('bhij,bhjd->bhid') + + self.qkv = Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = Dropout(proj_drop) + self.softmax = Softmax(dim=-1) + + self.attn_cam = None + self.attn = None + self.v = None + self.v_cam = None + self.attn_gradients = None + + def get_attn(self): + return self.attn + + def save_attn(self, attn): + self.attn = attn + + def save_attn_cam(self, cam): + self.attn_cam = cam + + def get_attn_cam(self): + return self.attn_cam + + def get_v(self): + return self.v + + def save_v(self, v): + self.v = v + + def save_v_cam(self, cam): + self.v_cam = cam + + def get_v_cam(self): + return self.v_cam + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def forward(self, x): + b, n, _, h = *x.shape, self.num_heads + qkv = self.qkv(x) + q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv=3, h=h) + + self.save_v(v) + + dots = self.matmul1([q, k]) * self.scale + + attn = self.softmax(dots) + attn = self.attn_drop(attn) + + self.save_attn(attn) + attn.register_hook(self.save_attn_gradients) + + out = self.matmul2([attn, v]) + out = rearrange(out, 'b h n d -> b n (h d)') + + out = self.proj(out) + out = self.proj_drop(out) + return out + + def relprop(self, cam, **kwargs): + cam = self.proj_drop.relprop(cam, **kwargs) + cam = self.proj.relprop(cam, **kwargs) + cam = rearrange(cam, 'b n (h d) -> b h n d', h=self.num_heads) + + # attn = A*V + (cam1, cam_v)= self.matmul2.relprop(cam, **kwargs) + cam1 /= 2 + cam_v /= 2 + + self.save_v_cam(cam_v) + self.save_attn_cam(cam1) + + cam1 = self.attn_drop.relprop(cam1, **kwargs) + cam1 = self.softmax.relprop(cam1, **kwargs) + + # A = Q*K^T + (cam_q, cam_k) = self.matmul1.relprop(cam1, **kwargs) + cam_q /= 2 + cam_k /= 2 + + cam_qkv = rearrange([cam_q, cam_k, cam_v], 'qkv b h n d -> b n (qkv h d)', qkv=3, h=self.num_heads) + + return self.qkv.relprop(cam_qkv, **kwargs) + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.): + super().__init__() + self.norm1 = LayerNorm(dim, eps=1e-6) + 
self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.norm2 = LayerNorm(dim, eps=1e-6) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, drop=drop) + + self.add1 = Add() + self.add2 = Add() + self.clone1 = Clone() + self.clone2 = Clone() + + def forward(self, x): + x1, x2 = self.clone1(x, 2) + x = self.add1([x1, self.attn(self.norm1(x2))/2.0]) + x1, x2 = self.clone2(x, 2) + x = self.add2([x1, self.mlp(self.norm2(x2))/2.0]) + return x + + def relprop(self, cam, **kwargs): + (cam1, cam2) = self.add2.relprop(cam, **kwargs) + cam2 = self.mlp.relprop(cam2, **kwargs) + cam2 = self.norm2.relprop(cam2, **kwargs) + cam = self.clone2.relprop((cam1, cam2), **kwargs) + + (cam1, cam2) = self.add1.relprop(cam, **kwargs) + cam2 = self.attn.relprop(cam2, **kwargs) + cam2 = self.norm1.relprop(cam2, **kwargs) + cam = self.clone1.relprop((cam1, cam2), **kwargs) + return cam + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + def relprop(self, cam, **kwargs): + cam = cam.transpose(1,2) + cam = cam.reshape(cam.shape[0], cam.shape[1], + (self.img_size[0] // self.patch_size[0]), (self.img_size[1] // self.patch_size[1])) + return self.proj.relprop(cam, **kwargs) + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = BatchNorm2d(64) + self.relu = ReLU(inplace=True) + self.conv2 = Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = BatchNorm2d(64) + self.conv3 = Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = BatchNorm2d(64) + + self.proj = Conv2d(64, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x.flatten(2).transpose(1, 2) + def relprop(self, cam, **kwargs): + cam = cam.transpose(1,2) + cam = cam.reshape(cam.shape[0], cam.shape[1], + (self.img_size[0] // self.patch_size[0]), (self.img_size[1] // self.patch_size[1])) + cam = self.proj.relprop(cam, **kwargs) + cam = 
self.bn3.relprop(cam, **kwargs) + cam = self.conv3.relprop(cam, **kwargs) + cam = self.bn2.relprop(cam, **kwargs) + cam = self.conv2.relprop(cam, **kwargs) + cam = self.bn1.relprop(cam, **kwargs) + cam = self.conv1.relprop(cam, **kwargs) + return cam + +class VisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, mlp_head=False, drop_rate=0., attn_drop_rate=0.): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.patch_embed = PatchEmbed4_2( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate) + for i in range(depth)]) + + self.norm = LayerNorm(embed_dim) + if mlp_head: + # paper diagram suggests 'MLP head', but results in 4M extra parameters vs paper + self.head = Mlp(embed_dim, int(embed_dim * mlp_ratio), num_classes) + else: + # with a single Linear layer as head, the param count within rounding of paper + self.head = Linear(embed_dim, num_classes) + self.aux_head = Linear(embed_dim, num_classes) + # FIXME not quite sure what the proper weight init is supposed to be, + # normal / trunc normal w/ std == .02 similar to other Bert like transformers + trunc_normal_(self.pos_embed, std=.02) # embeddings same as weights? 
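+        # the class token below gets the same truncated-normal init (std=0.02) as
+        # the positional embedding above, following common ViT initialisation practice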
+ trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + self.pool = IndexSelect() + self.add = Add() + + self.inp_grad = None + + def save_inp_grad(self,grad): + self.inp_grad = grad + + def get_inp_grad(self): + return self.inp_grad + + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @property + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = self.add([x, self.pos_embed]) + + x.register_hook(self.save_inp_grad) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + x = self.pool(x, dim=1, indices=torch.tensor(0, device=x.device)) + x = x.squeeze(1) + x = self.head(x) + return x + + def relprop(self, cam=None,method="transformer_attribution", is_ablation=False, start_layer=0, **kwargs): + # print(kwargs) + # print("conservation 1", cam.sum()) + cam = self.head.relprop(cam, **kwargs) + cam = cam.unsqueeze(1) + cam = self.pool.relprop(cam, **kwargs) + cam = self.norm.relprop(cam, **kwargs) + for blk in reversed(self.blocks): + cam = blk.relprop(cam, **kwargs) + + # print("conservation 2", cam.sum()) + # print("min", cam.min()) + + if method == "full": + (cam, _) = self.add.relprop(cam, **kwargs) + cam = cam[:, 1:] + cam = self.patch_embed.relprop(cam, **kwargs) + # sum on channels + cam = cam.sum(dim=1) + return cam + + elif method == "rollout": + # cam rollout + attn_cams = [] + for blk in self.blocks: + attn_heads = blk.attn.get_attn_cam().clamp(min=0) + avg_heads = (attn_heads.sum(dim=1) / attn_heads.shape[1]).detach() + attn_cams.append(avg_heads) + cam = compute_rollout_attention(attn_cams, start_layer=start_layer) + cam = cam[:, 0, 1:] + return cam + + # our method, method name grad is legacy + elif method == "transformer_attribution" or method == "grad": + cams = [] + for blk in self.blocks: + grad = blk.attn.get_attn_gradients() + cam = blk.attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cams.append(cam.unsqueeze(0)) + rollout = compute_rollout_attention(cams, start_layer=start_layer) + cam = rollout[:, 0, 1:] + return cam + + elif method == "last_layer": + cam = self.blocks[-1].attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + if is_ablation: + grad = self.blocks[-1].attn.get_attn_gradients() + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + elif method == "last_layer_attn": + cam = self.blocks[-1].attn.get_attn() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + elif method == "second_layer": + cam = self.blocks[1].attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + if is_ablation: + grad = self.blocks[1].attn.get_attn_gradients() + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + +def _conv_filter(state_dict, 
patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + return out_dict + + +def lvvit_base_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=512, depth=20, num_heads=8, mlp_ratio=3, qkv_bias=False, **kwargs) + model.default_cfg = default_cfgs['lvvit_base_patch16_224'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +def lvvit_small_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3, qkv_bias=False, **kwargs) + model.default_cfg = default_cfgs['lvvit_small_patch16_224'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +def lvvit_small_patch16_384(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3, qkv_bias=False, img_size=384, **kwargs) + model.default_cfg = default_cfgs['lvvit_small_patch16_384'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model
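+
+
+if __name__ == '__main__':
+    # Illustrative smoke test only -- not part of the original relevancy-propagation
+    # code. It builds a randomly initialised LV-ViT-S and runs one forward pass to
+    # confirm that the 1 cls token + 14x14 patch pipeline yields [batch, num_classes]
+    # logits. Gradients stay enabled because the attention/input hooks used for LRP
+    # cannot be registered under torch.no_grad().
+    model = lvvit_small_patch16_224(pretrained=False)
+    model.eval()
+    dummy = torch.randn(1, 3, 224, 224)
+    logits = model(dummy)
+    print(logits.shape)  # expected: torch.Size([1, 1000])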