diff --git a/PyTorch/contrib/cv/classification/LVVIT/LICENSE b/PyTorch/contrib/cv/classification/LVVIT/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6c1cbb5efebe718b26faa414d1835a92a47c5f0a --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. 
+ + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Zihang Jiang + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/PyTorch/contrib/cv/classification/LVVIT/README.md b/PyTorch/contrib/cv/classification/LVVIT/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bccf23ce933350457a781753ec4254e1592f53f
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/README.md
@@ -0,0 +1,92 @@
+
+
+## LV-ViT
+
+LV-ViT is a Transformer-based model for image classification trained with token labeling, introduced in "All Tokens Matter: Token Labeling for Training Better Vision Transformers" ([arXiv](https://arxiv.org/abs/2104.10858)).
+
+## Requirements
+
+torch>=1.4.0
+torchvision>=0.5.0
+pyyaml
+scipy
+timm==0.4.5
+
+Data preparation: ImageNet with the following folder structure
+
+```
+│imagenet/
+├──train/
+│ ├── n01440764
+│ │ ├── n01440764_10026.JPEG
+│ │ ├── n01440764_10027.JPEG
+│ │ ├── ......
+│ ├── ......
+├──val/
+│ ├── n01440764
+│ │ ├── ILSVRC2012_val_00000293.JPEG
+│ │ ├── ILSVRC2012_val_00002138.JPEG
+│ │ ├── ......
+│ ├── ......
+```
+
+## Label generation
+
+To generate token label data for training:
+
+```bash
+python3 generate_label.py /path/to/imagenet/train /path/to/save/label_top5_train_nfnet --model dm_nfnet_f6 --pretrained --img-size 576 -b 32 --crop-pct 1.0
+```
+
+The generated label data is also provided on [Baidu Yun](https://pan.baidu.com/s/1YBqiNN9dAzhEXtPl61bZJw) (password: y6j2).
+
+## Model Train
+
+Train LV-ViT-S:
+
+```bash
+# 1: train on 1 NPU
+bash test/train_full_1p.sh '/Path_to_Imagenet' '/Path_to_Token-label-data'
+# Example: bash test/train_full_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+
+# 2: train on 8 NPUs
+bash test/train_full_8p.sh '/Path_to_Imagenet' '/Path_to_Token-label-data'
+# Example: bash test/train_full_8p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+```
+
+Get model performance:
+
+```bash
+# 1: test 1p performance
+bash test/train_performance_1p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/'
+# Example: bash test/train_performance_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+
+# 2: test 8p performance
+bash test/train_performance_8p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/'
+# Example: bash test/train_performance_8p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet'
+```
+
+## Validation
+
+Replace the first argument with your ImageNet validation set path and the second with the checkpoint path:
+
+```bash
+bash test/train_eval_8p.sh '/PATHTO/imagenet/val' '/PATHTO/LVVIT/eval_pth'
+# Example: bash test/train_eval_8p.sh '/opt/npu/imagenet/val' '/trained/model.pth.tar'
+```
+
+## Fine-tuning
+
+To fine-tune the pre-trained LV-ViT-S:
+
+```bash
+bash test/train_finetune_1p.sh '/Path_to_Imagenet/' '/Path_to_Token-label-data/' '/Path_to_Trained_pth/'
+# Example: bash test/train_finetune_1p.sh '/opt/npu/imagenet/' './label_top5_train_nfnet' './finetune/lvvit_s-26m-224-83.3.pth.tar'
+```
+
+## About Train FPS
+
+Example log:
+
+```
+Train: 257 [ 150/625 ( 24%)] Loss: 9.841134 (10.1421) Time: 1.941s, 1054.88/s (2.048s, 1000.09/s) LR: 4.609e-04 Data: 0.029 (0.062)
+```
+
+The FPS is the per-batch rate reported after the batch time; in the log above it is 1054.88.
+
diff --git a/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh b/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh
new file mode 100644
index
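For reference, a minimal Python sketch of inspecting one generated token-label file. The file path below is hypothetical; the [2, 5, H, W] top-5 score/index layout is the one written by generate_label.py later in this diff:

```python
import torch

# Load one token-label file written by generate_label.py (hypothetical path).
# Layout is [2, 5, H, W]: label[0] holds the top-5 scores, label[1] the top-5
# class indices, both stored as float16, one spatial position per image patch.
label = torch.load('./label_top5_train_nfnet/n01440764/n01440764_10026.pt')
scores, indices = label[0].float(), label[1].long()  # each of shape [5, H, W]
print(scores.shape, indices.shape)
print(indices[0])  # most-probable class id at every patch location
```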
0000000000000000000000000000000000000000..36196ea622f7bd832ded0b7ff348b71d1127ece3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/distributed_train.sh @@ -0,0 +1,5 @@ +#!/bin/bash +NUM_PROC=$1 +shift +python3 -m torch.distributed.launch --nproc_per_node=$NUM_PROC main.py "$@" + diff --git a/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py b/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py new file mode 100644 index 0000000000000000000000000000000000000000..76b71f8cd79c8ccfbe95fd82e38161fb7796fc9e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/flops_computation.py @@ -0,0 +1,35 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import tlt.models +# summary of model flops and parameters + +model_list = [tlt.models.lvvit_s, + tlt.models.lvvit_m, + tlt.models.lvvit_l] + +img_size_list=[224,288,384,448] + +for img_size in img_size_list: + for model_name in model_list: + model = model_name(img_size=img_size) + params = sum([m.numel() for m in model.parameters()]) + flops = model.patch_embed.flops() + for blk in model.blocks: + flops = flops + blk.flops(model.patch_embed.num_patches+1) + print("model: {}, img_size:{},\nparams:{:.2f} M, flops: {:.2f} G \n".format(model_name.__name__, img_size, params/1e6, flops/1e9)) + + print('-----------------------') \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/generate_label.py b/PyTorch/contrib/cv/classification/LVVIT/generate_label.py new file mode 100644 index 0000000000000000000000000000000000000000..ad9131147dda5049271621f708157d5c087f4859 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/generate_label.py @@ -0,0 +1,342 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +#!/usr/bin/env python3 +""" +Script to generate token label. 
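+For each training image the script saves one .pt tensor of shape [2, 5, H, W]: the top-5 scores and the top-5 class indices predicted at every patch location, stored as float16.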
+Adapted from https://github.com/rwightman/pytorch-image-models +""" +import argparse +import os +import csv +import glob +import time +import logging +import torch +import torch.nn as nn +import torch.nn.parallel +from collections import OrderedDict +from contextlib import suppress +import numpy as np + +from timm.models import create_model, load_checkpoint, is_model, list_models +from timm.data import create_dataset, create_loader, resolve_data_config +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy +from PIL import Image + +from timm.data import ImageDataset +import logging + +_logger = logging.getLogger(__name__) + + +_ERROR_RETRY = 50 + +has_apex = False +try: + from apex import amp + has_apex = True +except ImportError: + pass + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('labeling') + + +parser = argparse.ArgumentParser(description='Generate token label') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('token_label_root', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--split', metavar='NAME', default='validation', + help='dataset split (default: validation)') +parser.add_argument('--model', '-m', metavar='NAME', default='dpn92', + help='model architecture (default: dpn92)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 2)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--img-size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop pct') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('--num-classes', type=int, default=None, + help='Number classes in dataset') +parser.add_argument('--class-map', default='', type=str, metavar='FILENAME', + help='path to class to idx mapping file (default: "")') +parser.add_argument('--log-freq', default=10, type=int, + metavar='N', help='batch logging frequency (default: 10)') +parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--num-gpu', type=int, default=1, + help='Number of GPUS to use') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--amp', action='store_true', default=False, + help='Use AMP mixed precision. Defaults to Apex, fallback to native Torch AMP.') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--tf-preprocessing', action='store_true', default=False, + help='Use Tensorflow preprocessing pipeline (require CPU TF installed') +parser.add_argument('--use-ema', dest='use_ema', action='store_true', + help='use ema version of weights if present') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--legacy-jit', dest='legacy_jit', action='store_true', + help='use legacy jit mode for pytorch 1.5/1.5.1/1.6 to get back fusion performance') +parser.add_argument('--transfer', action='store_true', default=False, + help='disable evaluation due to dataset mismatch. Can be used to generate label for other dataset using imagenet pre-trained model') + +class ImageDatasetWithIndex(ImageDataset): + + def __getitem__(self, index): + img, target = self.parser[index] + try: + img = img.read() if self.load_bytes else Image.open(img).convert('RGB') + except Exception as e: + _logger.warning(f'Skipped sample (index {index}, file {self.parser.filename(index)}). 
{str(e)}') + self._consecutive_errors += 1 + if self._consecutive_errors < _ERROR_RETRY: + return self.__getitem__((index + 1) % len(self.parser)) + else: + raise e + self._consecutive_errors = 0 + if self.transform is not None: + img = self.transform(img) + if target is None: + target = torch.tensor(-1, dtype=torch.long) + return img, target, index + +class TokenLabelHead(nn.Module): + def __init__(self, base): + super(TokenLabelHead, self).__init__() + self.base = base + base_fc = self.base.get_classifier() + if hasattr(self.base, 'aux_head'): + base_fc = self.base.aux_head + if isinstance(base_fc, nn.Conv2d): + self.fc = base_fc + else: + self.fc = nn.Conv2d( + self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) + self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) + self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) + self.base.reset_classifier(0) # delete original fc layer + + def forward(self, x): + x = self.base.forward_features(x) + + if len(x.shape)==3: + # reshape for ViT like token based models + B,N,C = x.shape + H = int(N**0.5) + if N==H*H+1: + # remove cls token + x = x[:,1:] + x = x.transpose(1,2).reshape(B,C,H,H) + else: + assert x.shape[2]==x.shape[3], 'shape should be B,C,H,H' + + x = self.fc(x) + x = x.permute(0,2,3,1) + return x + + + +def validate(args): + # might as well try to validate something + args.pretrained = args.pretrained or not args.checkpoint + args.prefetcher = False + amp_autocast = suppress # do nothing + if args.amp: + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + else: + _logger.warning("Neither APEX or Native Torch AMP is available.") + assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set." + if args.native_amp: + amp_autocast = torch.cuda.amp.autocast + _logger.info('Generating label in mixed precision with native PyTorch AMP.') + elif args.apex_amp: + _logger.info('Generating label in mixed precision with NVIDIA APEX AMP.') + else: + _logger.info('Generating label in float32. AMP not enabled.') + + if args.legacy_jit: + set_jit_legacy() + + # create model + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + in_chans=3, + scriptable=args.torchscript) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
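+    # NOTE: the backbone is wrapped in TokenLabelHead below. For ViT-style outputs
+    # of shape [B, N, C] it drops the cls token, reshapes the remaining N = H*H
+    # patch tokens to [B, C, H, H] and applies the (1x1 conv) classifier at every
+    # location, so each image yields an [H, W, num_classes] score map whose
+    # per-patch top-5 entries are saved as the token label.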
+ args.num_classes = model.num_classes + + if args.checkpoint: + load_checkpoint(model, args.checkpoint, args.use_ema) + + param_count = sum([m.numel() for m in model.parameters()]) + _logger.info('Model %s created, param count: %d' % (args.model, param_count)) + + data_config = resolve_data_config(vars(args), model=model, use_test_size=True) + model = TokenLabelHead(model) + if args.torchscript: + torch.jit.optimized_execution(True) + model = torch.jit.script(model) + + model = model.cuda() + if args.apex_amp: + model = amp.initialize(model, opt_level='O1') + + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + if args.num_gpu > 1: + model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))) + + criterion = nn.CrossEntropyLoss().cuda() + + dataset = ImageDatasetWithIndex(args.data, parser=args.dataset, + load_bytes=args.tf_preprocessing, class_map=args.class_map) + + + + crop_pct = data_config['crop_pct'] + loader = create_loader( + dataset, + input_size=data_config['input_size'], + batch_size=args.batch_size, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + crop_pct=crop_pct, + pin_memory=args.pin_mem, + tf_preprocessing=args.tf_preprocessing,) + + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + os.makedirs(args.token_label_root, exist_ok=True) + model.eval() + with torch.no_grad(): + # warmup, reduce variability of first batch time, especially for comparing torchscript vs non + input = torch.randn((args.batch_size,) + data_config['input_size']).cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + model(input) + end = time.time() + for batch_idx, (input, target,idxs) in enumerate(loader): + target = target.cuda() + input = input.cuda() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + # compute output + output = model(input) + value, indices = output.topk(5) + for i in range(input.shape[0]): + path = dataset.parser[idxs[i]][0].name + score_path = os.path.join(args.token_label_root, + '/'.join(path.split('/')[-2:]).split('.')[0] + '.pt') + score_dict = os.path.join(args.token_label_root,path.split('/')[-2]) + os.makedirs(score_dict,exist_ok=True) + # save top 5 value and index with shape [2, 5, H, W] + torch.save(torch.stack([value[i].cpu().half(),indices[i].cpu().half()]).permute(0,3,1,2),score_path) + + output = output.mean((1,2)) + if args.transfer: + # do not record loss and acc + loss = torch.sum(output-output) + acc1, acc5 = loss, loss + + else: + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1.item(), input.size(0)) + top5.update(acc5.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if batch_idx % args.log_freq == 0: + _logger.info( + 'Test: [{0:>4d}/{1}] ' + 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) ' + 'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format( + batch_idx, len(loader), batch_time=batch_time, + rate_avg=input.size(0) / batch_time.avg, + loss=losses, top1=top1, top5=top5)) + + top1a, top5a = top1.avg, top5.avg + results = OrderedDict( + 
top1=round(top1a, 4), top1_err=round(100 - top1a, 4), + top5=round(top5a, 4), top5_err=round(100 - top5a, 4), + param_count=round(param_count / 1e6, 2), + img_size=data_config['input_size'][-1], + cropt_pct=crop_pct, + interpolation=data_config['interpolation']) + + _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format( + results['top1'], results['top1_err'], results['top5'], results['top5_err'])) + + return results + + +def main(): + setup_default_logging() + args = parser.parse_args() + validate(args) + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/main.py b/PyTorch/contrib/cv/classification/LVVIT/main.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5200bf5869a29c6b9d7893de2a5d1452a9ae38 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/main.py @@ -0,0 +1,883 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +#!/usr/bin/env python3 +""" ImageNet Training Script +""" +import argparse +from ast import arg +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime +import numpy as np +import random +import torch +import torch.nn as nn +import torchvision.utils +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from tlt.data import create_loader +from timm.data import create_dataset, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import create_model, resume_checkpoint, load_checkpoint, convert_splitbn_model, model_parameters +from timm.utils import * +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from timm.utils import ApexScaler, NativeScaler + +import tlt.models +from tlt.data import create_token_label_target, TokenLabelMixup, FastCollateTokenLabelMixup, create_token_label_loader, create_token_label_dataset +from tlt.loss import TokenLabelCrossEntropy, TokenLabelSoftTargetCrossEntropy +from tlt.utils import load_pretrained_weights + +import time +try: + from apex import amp + from apex.parallel import DistributedDataParallel as ApexDDP + from apex.parallel import convert_syncbn_model + has_apex = True +except ImportError: + has_apex = False + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + + +parser = 
argparse.ArgumentParser(description='PyTorch ImageNet Training') + +# Dataset / Model parameters +parser.add_argument('data_dir', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--train-split', metavar='NAME', default='train', + help='dataset train split (default: train)') +parser.add_argument('--val-split', metavar='NAME', default='validation', + help='dataset validation split (default: validation)') +parser.add_argument('--model', default='lvvit', type=str, metavar='MODEL', + help='Name of model to train (default: "lvvit"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=None, metavar='N', + help='number of label classes (Model default if None)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') +parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') +parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + +# Optimizer parameters +parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.05)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') +parser.add_argument('--clip-mode', type=str, default='norm', + help='Gradient clipping mode. 
One of ("norm", "value", "agc")') + + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') +parser.add_argument('--lr', type=float, default=1.6e-3, metavar='LR', + help='learning rate (default: 1.6e-3)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') +parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 300)') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: rand-m9-mstd0.5-inc1)'), +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. 
Use with `--aug-splits`.') +parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +# Batch norm parameters (only works with gen_efficientnet based models currently) +parser.add_argument('--bn-tf', action='store_true', default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.99992, + help='decay factor for model weights moving average (default: 0.99992)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('--checkpoint-hist', type=int, default=10, metavar='N', + help='number of checkpoints to keep (default: 10)') +parser.add_argument('-j', '--workers', type=int, default=8, metavar='N', + help='how many training processes to use (default: 1)') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 
0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') + +# Token labeling + +parser.add_argument('--token-label', action='store_true', default=False, + help='Use dense token-level label map for training') +parser.add_argument('--token-label-data', type=str, default='', metavar='DIR', + help='path to token_label data') +parser.add_argument('--token-label-size', type=int, default=1, metavar='N', + help='size of result token label map') +parser.add_argument('--dense-weight', type=float, default=1.0, + help='Token labeling loss multiplier (default: 1.0)') +parser.add_argument('--cls-weight', type=float, default=1.0, + help='Cls token prediction loss multiplier (default: 1.0)') +parser.add_argument('--ground-truth', action='store_true', default=False, + help='Use ground truth label to help refine generated target label') + + +# Finetune +parser.add_argument('--finetune', default='', type=str, metavar='PATH', + help='path to checkpoint file (default: none)') +parser.add_argument("--device_id", help="device_id", default=2, type=int) +parser.add_argument("--opt_level", default='O1', type=str, + help='Choose an optimize level, default O2') +parser.add_argument("--loss_scale", default=None, type=int, + help='set loss scale') +parser.add_argument("--distributed", action='store_true', default=False, + help='if distributed') + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. + args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + import random + setup_default_logging() + args, args_text = _parse_args() + + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29688' + + # 固定seed + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + os.environ['PYTHONHASHSEED'] = str(args.seed) + device = torch.device(f"npu:{args.device_id}") + args.device = device + torch.npu.set_device(device) + + args.prefetcher = not args.no_prefetcher + args.is_master_node = not args.distributed or args.device_id == 0 + + args.world_size = 1 + args.rank = 0 + if args.distributed: + args.world_size = int(os.environ['RANK_SIZE']) + args.rank = int(os.environ['RANK_ID']) + torch.distributed.init_process_group(backend='hccl', init_method='env://', world_size=args.world_size, rank=args.rank) + _logger.info('Training in distributed mode with multiple processes, 1 NPU per process. Process %d, total %d.' 
+ % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on 1 GPUs.') + + # resolve AMP arguments based on PyTorch / Apex availability + use_amp = None + if args.amp: + # `--amp` chooses native amp before apex (APEX ver not actively maintained) + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + if args.apex_amp and has_apex: + use_amp = 'apex' + elif args.native_amp and has_native_amp: + use_amp = 'native' + elif args.apex_amp or args.native_amp: + _logger.warning("Neither APEX or native Torch AMP is available, using float32. " + "Install NVIDA apex or upgrade to PyTorch 1.6") + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + scriptable=args.torchscript, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' + args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly + + if args.finetune: + load_pretrained_weights(model=model,checkpoint_path=args.finetune,use_ema=args.model_ema, strict=False, num_classes=args.num_classes) + + if args.is_master_node: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=args.is_master_node) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # enable split bn (separate bn stats per batch-portion) + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + # move model to GPU, enable channels last layout if set + model.to(device) + + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + # setup synchronized BatchNorm for distributed training + if args.distributed and args.sync_bn: + assert not args.split_bn + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.is_master_node: + _logger.info( + 'Converted model to use Synchronized BatchNorm. 
WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + + if args.torchscript: + assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' + assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' + model = torch.jit.script(model) + + from apex.optimizers import NpuFusedAdam + optimizer = NpuFusedAdam(model.parameters(), lr=args.lr) + + # setup automatic mixed-precision (AMP) loss scaling and op casting + amp_autocast = suppress # do nothing + loss_scaler = None + optimizers=None + if use_amp == 'apex': + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) + loss_scaler = ApexScaler() + if args.is_master_node: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + elif use_amp == 'native': + amp_autocast = torch.cuda.amp.autocast + loss_scaler = NativeScaler() + if args.is_master_node: + _logger.info('Using native Torch AMP. Training in mixed precision.') + else: + if args.is_master_node: + _logger.info('AMP not enabled. Training in float32.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info="is_master_node") + + # setup exponential moving average of model weights, SWA could be used here too + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEmaV2( + model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) + if args.resume: + load_checkpoint(model_ema.module, args.resume, use_ema=True) + + # setup distributed training + if args.distributed: + if args.is_master_node: + _logger.info("Using native Torch DistributedDataParallel.") + model = NativeDDP(model, device_ids=[args.device_id]) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + # setup learning rate schedule and starting epoch + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.epochs <= 5: + num_epochs = args.epochs + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.is_master_node: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + # create the train and eval datasets + + # create token_label dataset + if args.token_label_data: + dataset_train = create_token_label_dataset(args.dataset, root=args.data_dir, label_root=args.token_label_data) + else: + dataset_train = create_dataset( + args.dataset, root=args.data_dir, split=args.train_split, is_training=True, batch_size=args.batch_size) + dataset_eval = create_dataset( + args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) + + # setup mixup / cutmix + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + # create token_label mixup + if args.token_label_data: + mixup_args['label_size']=args.token_label_size + if args.prefetcher: + assert not num_aug_splits + collate_fn = FastCollateTokenLabelMixup(**mixup_args) + else: + mixup_fn = TokenLabelMixup(**mixup_args) + else: + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + assert not args.token_label + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + if args.token_label and args.token_label_data: + use_token_label = True + else: + use_token_label = False + loader_train = create_token_label_loader( + device, + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader, + use_token_label=use_token_label + ) + + loader_eval = create_loader( + device, + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + # setup loss function + + # use token_label loss + if args.token_label: + if args.token_label_size == 1: + # back to relabel/original ImageNet label + train_loss_fn = TokenLabelSoftTargetCrossEntropy().to(device) + else: + train_loss_fn = TokenLabelCrossEntropy(dense_weight=args.dense_weight, \ + cls_weight=args.cls_weight, mixup_active=mixup_active, ground_truth=args.ground_truth).to(device) + + else: + # smoothing is handled with mixup target transform or create_token_label_target function + train_loss_fn = SoftTargetCrossEntropy().to(device) + + validate_loss_fn = nn.CrossEntropyLoss().to(device) + + # setup checkpoint saver and eval metric tracking + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + saver = None + output_dir = '' + if args.is_master_node: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if 
eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: + if args.finetune: + validate(device, model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + for epoch in range(start_epoch, num_epochs): + if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): + loader_train.sampler.set_epoch(epoch) + + train_metrics = train_one_epoch(device, + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn, optimizers=optimizers) + + #break + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.is_master_node: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(device, model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate(device, + model_ema.module, loader_eval, validate_loss_fn, args, + amp_autocast=amp_autocast, + log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_one_epoch( + device, + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None, optimizers = None): + + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * len(loader) + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.to(device), target.to(device) + if mixup_fn is not None: + input, target = mixup_fn(input, target) + else: + # handle token_label without mixup + if args.token_label and args.token_label_data: + target=create_token_label_target(target,num_classes=args.num_classes, + smoothing=args.smoothing, label_size=args.token_label_size, device=device) + if len(target.shape)==1: + 
target=create_token_label_target(target,num_classes=args.num_classes,
+                        smoothing=args.smoothing, device=device)
+        else:
+            if args.token_label and args.token_label_data and not loader.mixup_enabled:
+                target=create_token_label_target(target,num_classes=args.num_classes,
+                    smoothing=args.smoothing, label_size=args.token_label_size, device=device)
+            if len(target.shape)==1:
+                target=create_token_label_target(target,num_classes=args.num_classes,
+                    smoothing=args.smoothing, device=device)
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
+        # if batch_idx in [60, 61]:
+        #     import pdb
+        #     pdb.set_trace()
+        if batch_idx == 60:
+            with torch.autograd.profiler.profile(use_npu=True) as prof:
+                with amp_autocast():
+                    output = model(input)
+                    loss = loss_fn(output, target)
+
+                if not args.distributed:
+                    losses_m.update(loss.item(), input.size(0))
+
+                optimizer.zero_grad()
+
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward(create_graph=second_order)
+
+                optimizer.step()
+
+            prof.export_chrome_trace("output.prof") # "output.prof" is the path of the exported profiling trace file
+        else:
+            with amp_autocast():
+                output = model(input)
+                loss = loss_fn(output, target)
+
+            if not args.distributed:
+                losses_m.update(loss.item(), input.size(0))
+
+            optimizer.zero_grad()
+
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward(create_graph=second_order)
+
+            optimizer.step()
+
+        if model_ema is not None:
+            model_ema.update(model)
+
+        torch.npu.synchronize()
+        num_updates += 1
+        batch_time_m.update(time.time() - end)
+        # if last_batch or batch_idx % args.log_interval == 0:
+        if last_batch or batch_idx % 50 == 0:
+            lrl = [param_group['lr'] for param_group in optimizer.param_groups]
+            lr = sum(lrl) / len(lrl)
+
+            if args.distributed:
+                reduced_loss = reduce_tensor(loss.data, args.world_size)
+                losses_m.update(reduced_loss.item(), input.size(0))
+
+            if args.is_master_node:
+                _logger.info(
+                    'Train: {} [{:>4d}/{} ({:>3.0f}%)] '
+                    'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
+                    'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
+                    '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                    'LR: {lr:.3e} '
+                    'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
+                        epoch,
+                        batch_idx, len(loader),
+                        100.
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(device, model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.to(device) + target = target.to(device) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + if args.cls_weight==0: + output=output[1].mean(1) + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if args.is_master_node and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt b/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..31529da2e68f25b61e2a3e698a07537281443c03 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:OK +PrecisionStatus:OK \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/requirements.txt b/PyTorch/contrib/cv/classification/LVVIT/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..5752b6818787e99f5243d5e9409ac34f25f7f9da --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/requirements.txt @@ -0,0 +1,20 @@ +apex==0.1+ascend +attr==0.3.1 +attrs==21.2.0 +h5py==2.8.0 +matplotlib==3.5.1 +numpy==1.21.4 +packaging==21.3 +Pillow==8.4.0 +PyYAML==6.0 +scikit-learn==0.24.2 +scipy==1.7.2 +setuptools==40.4.3 +six==1.16.0 +tabulate==0.8.9 +te==0.4.0 +timm==0.4.5 +torch==1.5.0+ascend +torchvision==0.6.0a0 +tqdm==4.19.9 +wheel== 0.32.1 \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e28c978bc253d65d022982913c3c3d56752b0d --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_l_512x512_160k_ade20k.py @@ -0,0 +1,78 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py','../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(448,448), + depth=24, + out_channels=2048, + out_indices=(14, 17, 20, 23), + patch_size=16, + drop_path_rate=0.5, + embed_dim=768, + num_heads=12, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=128, + use_side_layer=True), + decode_head=dict( + type='UPerHead', + in_channels=[768, 768, 768, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=768, + in_index=2, + channels=512, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + # test_cfg=dict(mode='whole')) + + +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +data=dict(samples_per_gpu=2) +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py 
b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..b4ea6ec82d6d937017581aec672a97274b7253c6 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_m_512x512_160k_ade20k.py @@ -0,0 +1,76 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py','../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(512,512), + depth=20, + out_channels=2048, + out_indices=(10, 13, 16, 19), + patch_size=16, + drop_path_rate=0.2, + embed_dim=512, + num_heads=8, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=64, + use_side_layer=True, + skip_lam=2.0), + decode_head=dict( + type='UPerHead', + in_channels=[512, 512, 512, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=256, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=512, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..5140dd5bf4793060c4b405b8b8b930302ac57cad --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/configs/lvvit/upernet_lvvit_s_512x512_160k_ade20k.py @@ -0,0 +1,76 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +_base_ = ['../_base_/datasets/ade20k.py', '../_base_/default_runtime.py'] +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='', + backbone=dict( + type='ViT', + img_size=(512,512), + depth=16, + out_channels=2048, + out_indices=(6, 9, 12, 15), + patch_size=16, + drop_path_rate=0.1, + embed_dim=384, + num_heads=6, + mlp_ratio=3., + qkv_bias=False, + p_emb='4_2', + stem_dim=64, + use_side_layer=True, + skip_lam=2.0), + decode_head=dict( + type='UPerHead', + in_channels=[384, 384, 384, 2048], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=256, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=384, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341))) + # test_cfg=dict(mode='whole')) +optimizer = dict(type='AdamW', lr=6e-5, betas=(0.9, 0.999), weight_decay=0.01) +optimizer_config = dict() +# learning policy +lr_config = dict(policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0., by_epoch=False) +# runtime settings +runner = dict(type='IterBasedRunner', max_iters=160000) +checkpoint_config = dict(by_epoch=False, interval=8000) +evaluation = dict(interval=8000, metric='mIoU') +fp16=dict() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py b/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..9939c2353610a3fb84b294b5ccef8a41a0d7131a --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/seg/mmseg/models/backbones/vit.py @@ -0,0 +1,605 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#coding=utf-8 + +""" +Modified from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/backbones/vit.py +""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, + constant_init, kaiming_init, normal_init) +from mmcv.runner import _load_checkpoint +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import DropPath, trunc_normal_ +from functools import partial +from itertools import repeat +import collections.abc + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + return parse + +to_2tuple = _ntuple(2) +class Mlp(nn.Module): + """MLP layer for Encoder block. + + Args: + in_features(int): Input dimension for the first fully + connected layer. + hidden_features(int): Output dimension for the first fully + connected layer. + out_features(int): Output dementsion for the second fully + connected layer. + act_cfg(dict): Config dict for activation layer. + Default: dict(type='GELU'). + drop(float): Drop rate for the dropout layer. Dropout rate has + to be between 0 and 1. Default: 0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + """Attention layer for Encoder block. + + Args: + dim (int): Dimension for the input vector. + num_heads (int): Number of parallel attention heads. + qkv_bias (bool): Enable bias for qkv if True. Default: False. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for output weights. Default: 0. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super(Attention, self).__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + b, n, c = x.shape + qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, + c // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1).float().cpu() + attn = self.attn_drop(attn).npu().half() + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + """Implements encoder block with residual connection. + + Args: + dim (int): The feature dimension. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. 
+ drop (float): Drop rate for mlp output weights. Default: 0. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for attn layer output weights. + Default: 0. + drop_path (float): Drop rate for paths of model. + Default: 0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', requires_grad=True). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + skip_lam (float): residual connection factor. Default: 1.0 + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + proj_drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False, + skip_lam=1.0): + super(Block, self).__init__() + self.with_cp = with_cp + _, self.norm1 = build_norm_layer(norm_cfg, dim) + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, + proj_drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + _, self.norm2 = build_norm_layer(norm_cfg, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop) + self.skip_lam=skip_lam + + def forward(self, x): + + def _inner_forward(x): + out = x + self.drop_path(self.attn(self.norm1(x)))/self.skip_lam + out = out + self.drop_path(self.mlp(self.norm2(out)))/self.skip_lam + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Input image size. + default: 224. + patch_size (int): Width and height for a patch. + default: 16. + in_channels (int): Input channels for images. Default: 3. + embed_dim (int): The embedding dimension. Default: 768. 
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super(PatchEmbed, self).__init__() + if isinstance(img_size, int): + self.img_size = (img_size, img_size) + elif isinstance(img_size, tuple): + self.img_size = img_size + else: + raise TypeError('img_size must be type of int or tuple') + h, w = self.img_size + self.patch_size = (patch_size, patch_size) + self.num_patches = (h // patch_size) * (w // patch_size) + self.proj = Conv2d( + in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + return self.proj(x) + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768, stem_dim=64): + super().__init__() + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_channels, stem_dim, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(stem_dim) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(stem_dim, stem_dim, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(stem_dim) + self.conv3 = nn.Conv2d(stem_dim, stem_dim, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(stem_dim) + + self.proj = nn.Conv2d(stem_dim, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + +@BACKBONES.register_module() +class VisionTransformer(nn.Module): + """Vision transformer backbone. + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for + Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 + + Args: + img_size (tuple): input image size. Default: (224, 224). + patch_size (int, tuple): patch size. Default: 16. + in_channels (int): number of input channels. Default: 3. + embed_dim (int): embedding dimension. Default: 768. + depth (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + qk_scale (float): override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): dropout rate. Default: 0. + attn_drop_rate (float): attention dropout rate. Default: 0. + drop_path_rate (float): Rate of DropPath. Default: 0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', eps=1e-6, requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + out_reshape (str): Select the output format of feature information. + Default: NCHW. 
+ interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Default: bicubic. + with_cls_token (bool): If concatenating class token into image tokens + as transformer input. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + p_emb (str): Which Patch Embedding to use. + Default: None, using naive Patch Embedding. + stem_dim (int): hidden dim in Patch Embedding module. + Default: 64. + skip_lam (float): residual connection factor. + Default: 1.0. + use_side_layer (bool): whether use the side layer for UperNet and FCN. + Default: False (use the neck instead) + fcn (bool): switch between FCN and UperNet. + Default: False (use UperNet). + """ + + def __init__(self, + img_size=(224, 224), + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + out_indices=11, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), + act_cfg=dict(type='GELU'), + norm_eval=False, + final_norm=False, + out_shape='NCHW', + with_cls_token=True, + interpolate_mode='bicubic', + with_cp=False, + out_channels=768, + p_emb=None, + stem_dim=64, + skip_lam=1.0, + use_side_layer=False, + fcn=False): + super(VisionTransformer, self).__init__() + self.img_size = img_size + self.patch_size = patch_size + self.features = self.embed_dim = embed_dim + if p_emb=='4_2': + patch_embed_fn = partial(PatchEmbed4_2,stem_dim=stem_dim) + else: + patch_embed_fn = PatchEmbed + self.patch_embed = patch_embed_fn( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + + self.with_cls_token = with_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + self.num_patches = self.patch_embed.num_patches + if isinstance(out_indices, int): + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=dpr[i], + attn_drop=attn_drop_rate, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + skip_lam=skip_lam) for i in range(depth) + ]) + self.use_side_layer = use_side_layer + if use_side_layer: + if not fcn: + self.side_layer1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 4, stride=4, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer2 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 2, stride=2, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer3 = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer4 = nn.Sequential( + nn.Conv2d(embed_dim, out_channels, 1, 1, 0, bias=False), 
+ nn.SyncBatchNorm(out_channels), + nn.ReLU(True), + ) + else: + self.side_layer1 = nn.Identity() + self.side_layer2 = nn.Identity() + self.side_layer3 = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim, 1, 1, 0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + ) + self.side_layer4 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, 2, stride=2, padding=0, bias=False), + nn.SyncBatchNorm(embed_dim), + nn.ReLU(True), + nn.Conv2d(embed_dim, out_channels, 1, 1, 0, bias=False), + nn.SyncBatchNorm(out_channels), + nn.ReLU(True), + ) + assert out_shape in ['NLC', + 'NCHW'], 'output shape must be "NLC" or "NCHW".' + + self.out_shape = out_shape + + self.interpolate_mode = interpolate_mode + self.final_norm = final_norm + if final_norm: + _, self.norm = build_norm_layer(norm_cfg, embed_dim) + + self.norm_eval = norm_eval + self.with_cp = with_cp + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + checkpoint = _load_checkpoint(pretrained, logger=logger) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + if 'pos_embed' in state_dict.keys(): + if self.pos_embed.shape != state_dict['pos_embed'].shape: + logger.info(msg=f'Resize the pos_embed shape from \ +{state_dict["pos_embed"].shape} to {self.pos_embed.shape}') + h, w = self.img_size + pos_size = int( + math.sqrt(state_dict['pos_embed'].shape[1] - 1)) + state_dict['pos_embed'] = self.resize_pos_embed( + state_dict['pos_embed'], (h, w), (pos_size, pos_size), + self.patch_size, self.interpolate_mode) + + self.load_state_dict(state_dict, False) + + elif pretrained is None: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'mlp' in n: + normal_init(m.bias, std=1e-6) + else: + constant_init(m.bias, 0) + elif isinstance(m, Conv2d): + kaiming_init(m.weight, mode='fan_in') + if m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) + else: + raise TypeError('pretrained must be a str or None') + + @staticmethod + def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode): + """Resize pos_embed weights. + + Resize pos_embed using bicubic interpolate method. + Args: + pos_embed (torch.Tensor): pos_embed weights. + input_shpae (tuple): Tuple for (input_h, intput_w). + pos_shape (tuple): Tuple for (pos_h, pos_w). + patch_size (int): Patch size. 
+ Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C] + """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + input_h, input_w = input_shpae + pos_h, pos_w = pos_shape + cls_token_weight = pos_embed[:, 0] + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) + pos_embed_weight = F.interpolate( + pos_embed_weight, + size=[input_h // patch_size, input_w // patch_size], + align_corners=False, + mode=mode) + cls_token_weight = cls_token_weight.unsqueeze(1) + pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) + pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) + return pos_embed + + def _pos_embeding(self, x, h, w): + """Positiong embeding method. + + Resize the pos_embed, if the input image size doesn't match + the training size. + Args: + x (torch.Tensor): The pos_embed weighs, it should be + shape of [B, L2, c]. + h (int): training feature map height + w (int): training feature map width + Return: + torch.Tensor: The pos encoded image feature. + """ + B,_,C = x.size() + ct = x[:,0].unsqueeze(2) + ts = x[:,1:].transpose(1, 2).reshape(B, C, int(self.num_patches ** 0.5), int(self.num_patches ** 0.5)) + ts = F.interpolate(ts, (h, w), mode='bicubic', align_corners=False) + ts = ts.flatten(2) + x = torch.cat([ct, ts], dim=2).transpose(1, 2) + return x + def forward(self, inputs): + B = inputs.shape[0] + + x = self.patch_embed(inputs) + B, C, H, W = x.size() + x = x.flatten(2).transpose(1, 2) + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + #x = self._pos_embeding(inputs, x, self.pos_embed) + x = x + self._pos_embeding(self.pos_embed, H, W) + + if not self.with_cls_token: + # Remove class token for transformer input + x = x[:, 1:] + + outs = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if i == len(self.blocks) - 1: + if self.final_norm: + x = self.norm(x) + if i in self.out_indices: + if self.with_cls_token: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + else: + out = x + if self.out_shape == 'NCHW': + B, _, C = out.shape + out = out.reshape(B, H, W, C).permute(0, 3, 1, 2) + outs.append(out) + if self.use_side_layer: + outs[0] = self.side_layer1(outs[0]) + outs[1] = self.side_layer2(outs[1]) + outs[2] = self.side_layer3(outs[2]) + outs[3] = self.side_layer4(outs[3]) + return tuple(outs) + + def train(self, mode=True): + super(VisionTransformer, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.LayerNorm): + m.eval() + +@BACKBONES.register_module() +class ViT(VisionTransformer): + def __init__(self, **kwargs): + super(ViT, self).__init__(**kwargs) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/setup.py b/PyTorch/contrib/cv/classification/LVVIT/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..132fab40f9d587de540f6a447c8cbcb4c0bd6f1c --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/setup.py @@ -0,0 +1,59 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from setuptools import setup, find_packages +from codecs import open +from os import path + +here = path.abspath(path.dirname(__file__)) +# Get the long description from the README file +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name = 'tlt', + packages = find_packages(exclude=['seg','visualize']), + version = '0.2.0', + license='Apache License 2.0', + long_description=long_description, + long_description_content_type='text/markdown', + description = 'Token Labeling Toolbox for training image models', + author = 'Zihang Jiang', + author_email = 'jzh0103@gmail.com', + url = 'https://github.com/zihangJiang/TokenLabeling', + keywords = [ + 'imagenet', + 'attention mechanism', + 'transformer', + 'image classification', + 'token labeling' + ], + install_requires=[ + 'timm>=0.4.5', + 'torch>=1.5', + 'torchvision', + 'scipy', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + ], +) diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh b/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..7cabb607aab44bf7262886a61767b11470c03559 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/env_npu.sh @@ -0,0 +1,75 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export 
OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d66aee3da9a528ac7699cafcc715d3869870c65 --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/LVVIT/test/train_eval_8p.sh
@@ -0,0 +1,15 @@
+source env_npu.sh
+if [ ! $1 ];
+then
+    DATA_DIR=/path/to/imagenet/val
+else
+    DATA_DIR="$1"
+fi
+if [ ! $2 ];
+then
+    MODEL_DIR=/path/to/checkpoint
+else
+    MODEL_DIR="$2"
+fi
+python3 validate.py $DATA_DIR --model lvvit_s --checkpoint $MODEL_DIR/lvvit_s-26m-224-83.3.pth.tar --no-test-pool --amp -b 64
+#python3 validate.py $DATA_DIR --model lvvit_s --checkpoint $MODEL_DIR/lvvit_s-26m-224-83.3.pth.tar --no-test-pool --amp --img-size 224 -b 64
diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fdf6cc4e908c9934e91068031a9371fe29e85ffe
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_finetune_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adjust for the model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LVVIT_for_PyTorch"
+# Training batch_size
+batch_size=256
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep the default, no need to modify
+data_path="/opt/npu/imagenet"
+
+# Training epochs
+train_epochs=300
+# Learning rate
+learning_rate=1.6e-3
+# Number of data loading workers
+workers=32
+
+# Parameter check: data_path is required; other parameters may be added or removed as the model requires, and any new parameter must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --workers* ]];then
+        workers=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path is provided; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility; test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_diename=${cur_path##*/}
+if [ x"${cur_path_last_diename}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+source ./test/env_npu.sh
+if [ -e "nohup.out" ]; then
+    rm -f nohup.out
+fi
+RANK_ID=0
+
+nohup python3 main.py $1 \
+    --device_id ${RANK_ID} \
+    --model lvvit_s \
+    -b 256 \
+    --apex-amp \
+    --img-size 224 \
+    --drop-path 0.1 \
+    --workers 32 \
+    --token-label \
+    --token-label-data $2 \
+    --token-label-size 14 \
+    --model-ema \
+    --no-prefetcher \
+    --finetune $3 &
+wait
+################## Collect training statistics ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adjust for the model
+train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review according to the model
+cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0e9b68662165a32713c7a4e165ebbd02aab6acff
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_1p.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adjust for the model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi +RANK_ID=0 + +nohup python3 main.py $1 \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 300 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 32 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d071d1d27310250870882026271a46f072652355 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_full_8p.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export RANK_SIZE=8 +KERNEL_NUM=$(($(nproc)/${RANK_SIZE})) + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi + +for((RANK_ID=0;RANK_ID<$((RANK_SIZE));RANK_ID++)); + do + export RANK_ID=$RANK_ID + PID_START=$((KERNEL_NUM*RANK_ID)) + PID_END=$((PID_START+KERNEL_NUM-1)) + taskset -c ${PID_START}-${PID_END} nohup python3 -u main.py $1 \ + --distributed \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 300 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 16 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + done + +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 
+BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ffdcf9be1d7872c1c7b3a19c67568a1c7d335180 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_1p.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi +RANK_ID=0 + +nohup python3 main.py $1 \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 2 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 32 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +#TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", "'"${batch_size}"'"*1000/"'"${FPS}"'"}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +#echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ef5a37a803596a259e5957fe3f230c5b2a555bdb --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/test/train_performance_8p.sh @@ -0,0 +1,142 @@ +#!/bin/bash + 
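+# Usage (inferred from the launch command further below; paths are placeholders):
+#   bash test/train_performance_8p.sh <imagenet_data_dir> <token_label_data_dir>
+# $1 is forwarded to main.py as its positional data path, $2 as --token-label-data.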
+################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LVVIT_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/imagenet" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=1.6e-3 +# 加载数据进程数 +workers=32 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +export RANK_SIZE=8 +KERNEL_NUM=$(($(nproc)/${RANK_SIZE})) + +source ./test/env_npu.sh +if [ -e "nohup.out" ]; then + rm -f nohup.out +fi + +for((RANK_ID=0;RANK_ID<$((RANK_SIZE));RANK_ID++)); + do + export RANK_ID=$RANK_ID + PID_START=$((KERNEL_NUM*RANK_ID)) + PID_END=$((PID_START+KERNEL_NUM-1)) + taskset -c ${PID_START}-${PID_END} nohup python3 -u main.py $1 \ + --distributed \ + --device_id ${RANK_ID} \ + --model lvvit_s \ + -b 256 \ + --epochs 2 \ + --apex-amp \ + --img-size 224 \ + --drop-path 0.1 \ + --workers 16 \ + --token-label \ + --token-label-data $2 \ + --token-label-size 14 \ + --model-ema \ + --no-prefetcher & + done +wait + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +FPS=`cat nohup.out| grep '/s)' |tail -n 1 |awk -F '/s' '{print$1}'|awk -F ', ' '{print$2}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`cat nohup.out| grep 'Best metric:'|awk -F ':' '{print$2}'|awk -F ' ' '{print$1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +cat nohup.out | grep "Loss:" | awk '{print $5}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40a242ff224b99725711c7ad963de7eec65f19b5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aed0bc30900dbdd5f003e436cb5ae7bab4ce9f49 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .dataset import DatasetTokenLabel, create_token_label_dataset +from .loader import create_token_label_loader +from .label_transforms_factory import create_token_label_transform +from .mixup import TokenLabelMixup, FastCollateTokenLabelMixup, mixup_target as create_token_label_target +from .loader import create_loader diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5ddf6d9678eda51f6e6880917d107479b2130bbd --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/dataset.py @@ -0,0 +1,143 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +""" Image dataset with label maps +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.utils.data as data + +import os +import re +import torch +import tarfile +import logging +from PIL import Image +_logger = logging.getLogger('token_label_dataset') + +IMG_EXTENSIONS = ['.png', '.jpg', '.jpeg'] + + +def natural_key(string_): + """See http://www.codinghorror.com/blog/archives/001018.html""" + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def find_images_and_targets(folder, types=IMG_EXTENSIONS, class_to_idx=None, leaf_name_only=True, sort=True): + labels = [] + filenames = [] + for root, subdirs, files in os.walk(folder, topdown=False): + rel_path = os.path.relpath(root, folder) if (root != folder) else '' + label = os.path.basename(rel_path) if leaf_name_only else rel_path.replace(os.path.sep, '_') + for f in files: + base, ext = os.path.splitext(f) + if ext.lower() in types: + filenames.append(os.path.join(root, f)) + labels.append(label) + if class_to_idx is None: + # building class index + unique_labels = set(labels) + sorted_labels = list(sorted(unique_labels, key=natural_key)) + class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} + images_and_targets = [(f, class_to_idx[l]) for f, l in zip(filenames, labels) if l in class_to_idx] + if sort: + images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k[0])) + return images_and_targets, class_to_idx + + +def load_class_map(filename, root=''): + class_map_path = filename + if not os.path.exists(class_map_path): + class_map_path = os.path.join(root, filename) + assert os.path.exists(class_map_path), 'Cannot locate specified class map file (%s)' % filename + class_map_ext = os.path.splitext(filename)[-1].lower() + if class_map_ext == '.txt': + with open(class_map_path) as f: + class_to_idx = {v.strip(): k for k, v in enumerate(f)} + else: + assert False, 'Unsupported class map extension' + return class_to_idx + + +class DatasetTokenLabel(data.Dataset): + + def __init__( + self, + root, + label_root, + load_bytes=False, + transform=None, + class_map=''): + + class_to_idx = None + if class_map: + class_to_idx = load_class_map(class_map, root) + images, class_to_idx = find_images_and_targets(root, class_to_idx=class_to_idx) + if len(images) == 0: + raise RuntimeError(f'Found 0 images in subfolders of {root}. 
' + f'Supported image extensions are {", ".join(IMG_EXTENSIONS)}') + self.root = root + self.label_root = label_root + self.samples = images + self.imgs = self.samples # torchvision ImageFolder compat + self.class_to_idx = class_to_idx + self.load_bytes = load_bytes + self.transform = transform + + def __getitem__(self, index): + path, target = self.samples[index] + score_path = os.path.join( + self.label_root, + '/'.join(path.split('/')[-2:]).split('.')[0] + '.pt') + + img = open(path, 'rb').read() if self.load_bytes else Image.open(path).convert('RGB') + score_maps = torch.load(score_path).float() + if self.transform is not None: + img, score_maps = self.transform(img, score_maps) + # append ground truth after coords + score_maps[-1,0,0,5]=target + return img, score_maps + + def __len__(self): + return len(self.samples) + + def filename(self, index, basename=False, absolute=False): + filename = self.samples[index][0] + if basename: + filename = os.path.basename(filename) + elif not absolute: + filename = os.path.relpath(filename, self.root) + return filename + + def filenames(self, basename=False, absolute=False): + fn = lambda x: x + if basename: + fn = os.path.basename + elif not absolute: + fn = lambda x: os.path.relpath(x, self.root) + return [fn(x[0]) for x in self.samples] + + +def create_token_label_dataset(dataset_type, root, label_root): + train_dir = os.path.join(root, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + if not os.path.exists(label_root): + _logger.error('Label folder does not exist at: {}'.format(label_root)) + exit(1) + return DatasetTokenLabel(train_dir, label_root) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..08a8414f7a679ef23fcd5eac536c70c9ada5ba57 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/label_transforms_factory.py @@ -0,0 +1,234 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+#coding=utf-8
+
+""" Transforms Factory
+
+Adapted for token labeling
+"""
+import math
+import warnings
+
+import torch
+from torchvision import transforms
+
+from .random_augment_label import rand_augment_transform
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
+from timm.data.transforms import _pil_interp, RandomResizedCropAndInterpolation, ToNumpy, ToTensor
+from timm.data.random_erasing import RandomErasing
+import random
+
+import torchvision
+from torchvision.transforms import functional as torchvision_F
+from PIL import Image
+
+_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
+
+class ComposeWithLabel(torchvision.transforms.Compose):
+    def __init__(self, **kwargs):
+        super(ComposeWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label_map):
+        # route label-aware transforms to (img, label_map); everything else only sees img
+        for t in self.transforms:
+            if type(t).__name__ == 'RandomHorizontalFlipWithLabel':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandomVerticalFlipWithLabel':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandAugment':
+                img, label_map = t(img, label_map)
+            elif type(t).__name__ == 'RandomResizedCropAndInterpolationWithCoords':
+                # RandomResizedCropAndInterpolationWithCoords should run after all other transformations
+                img, label_map = t(img, label_map)
+            else:
+                img = t(img)
+        return img, label_map
+
+class RandomResizedCropAndInterpolationWithCoords(RandomResizedCropAndInterpolation):
+    def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
+                 interpolation='bilinear'):
+        if isinstance(size, tuple):
+            self.size = size
+        else:
+            self.size = (size, size)
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("range should be of kind (min, max)")
+
+        if interpolation == 'random':
+            self.interpolation = _RANDOM_INTERPOLATION
+        else:
+            self.interpolation = _pil_interp(interpolation)
+        self.scale = scale
+        self.ratio = ratio
+
+    def __call__(self, img, label_map):
+        i, j, h, w = self.get_params(img, self.scale, self.ratio)
+        coords = (i / img.size[1],
+                  j / img.size[0],
+                  h / img.size[1],
+                  w / img.size[0])
+        coords_map = torch.zeros_like(label_map[0:1])
+        # trick: store the crop coords in an extra channel appended to label_map
+        coords_map[0,0,0,0],coords_map[0,0,0,1],coords_map[0,0,0,2],coords_map[0,0,0,3] = coords
+        label_map = torch.cat([label_map, coords_map])
+        if isinstance(self.interpolation, (tuple, list)):
+            interpolation = random.choice(self.interpolation)
+        else:
+            interpolation = self.interpolation
+        return torchvision_F.resized_crop(img, i, j, h, w, self.size,
+                                          interpolation), label_map
+
+class RandomHorizontalFlipWithLabel(torchvision.transforms.RandomHorizontalFlip):
+    def __init__(self, **kwargs):
+        super(RandomHorizontalFlipWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label):
+        if torch.rand(1) < self.p:
+            return torchvision_F.hflip(img), label.flip(3)
+        return img, label
+
+class RandomVerticalFlipWithLabel(torchvision.transforms.RandomVerticalFlip):
+    def __init__(self, **kwargs):
+        super(RandomVerticalFlipWithLabel, self).__init__(**kwargs)
+
+    def __call__(self, img, label):
+        if torch.rand(1) < self.p:
+            return torchvision_F.vflip(img), label.flip(2)
+        return img, label
+
+
+def transforms_imagenet_train(
+        img_size=224,
+        scale=None,
+        ratio=None,
+        hflip=0.5,
+        vflip=0.,
+        color_jitter=0.4,
+        auto_augment=None,
+        interpolation='random',
+        use_prefetcher=False,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+        re_prob=0.,
+        re_mode='const',
+        re_count=1,
+        re_num_splits=0,
+        separate=False,
+):
+    """
+    If separate==True,
the transforms are returned as a tuple of 3 separate transforms + for use in a mixing dataset that passes + * all data through the first (primary) transform, called the 'clean' data + * a portion of the data through the secondary transform + * normalizes and converts the branches above with the third, final transform + """ + scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range + ratio = tuple(ratio or (3./4., 4./3.)) # default imagenet ratio range + + primary_tfl=[] + if hflip > 0.: + primary_tfl += [RandomHorizontalFlipWithLabel(p=hflip)] + if vflip > 0.: + primary_tfl += [RandomVerticalFlipWithLabel(p=vflip)] + + secondary_tfl = [] + if auto_augment: + assert isinstance(auto_augment, str) + if isinstance(img_size, tuple): + img_size_min = min(img_size) + else: + img_size_min = img_size + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), + ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if auto_augment.startswith('rand'): + secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] + + elif color_jitter is not None: + # color jitter is enabled when not using AA + if isinstance(color_jitter, (list, tuple)): + # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation + # or 4 if also augmenting hue + assert len(color_jitter) in (3, 4) + else: + # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue + color_jitter = (float(color_jitter),) * 3 + secondary_tfl += [transforms.ColorJitter(*color_jitter)] + + final_tfl = [RandomResizedCropAndInterpolationWithCoords(size=img_size, scale=scale, ratio=ratio, interpolation=interpolation)] + + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + final_tfl += [ToNumpy()] + else: + final_tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ] + if re_prob > 0.: + final_tfl.append( + RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu')) + return ComposeWithLabel(transforms=primary_tfl + secondary_tfl + final_tfl) + + +def create_token_label_transform( + input_size, + is_training=False, + use_prefetcher=False, + no_aug=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + crop_pct=None, + tf_preprocessing=False, + separate=False,): + + if isinstance(input_size, tuple): + img_size = input_size[-2:] + else: + img_size = input_size + + transform = transforms_imagenet_train( + img_size, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=separate) + + return transform diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..70ff19759bf86a8342ac8291beed777f1ff151d3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/loader.py @@ -0,0 +1,393 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" Loader Factory, Fast Collate, CUDA Prefetcher +Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/loader.py and modified for token labeling +""" + +import torch.utils.data +import numpy as np + + +from .mixup import FastCollateTokenLabelMixup +from .label_transforms_factory import create_token_label_transform + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data.distributed_sampler import OrderedDistributedSampler +from timm.data.random_erasing import RandomErasing + + + +def fast_collate(batch): + """ A fast collation function optimized for uint8 images (np array or torch) and int64 targets (labels)""" + assert isinstance(batch[0], tuple) + batch_size = len(batch) + if isinstance(batch[0][0], tuple): + # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position + # such that all tuple of position n will end up in a torch.split(tensor, batch_size) in nth position + inner_tuple_size = len(batch[0][0]) + flattened_batch_size = batch_size * inner_tuple_size + targets = torch.zeros(flattened_batch_size, dtype=torch.int64) + tensor = torch.zeros((flattened_batch_size, *batch[0][0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + assert len(batch[i][0]) == inner_tuple_size # all input tensor tuples must be same length + for j in range(inner_tuple_size): + targets[i + j * batch_size] = batch[i][1] + tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j]) + return tensor, targets + elif isinstance(batch[0][0], np.ndarray): + if isinstance(batch[0][1], torch.Tensor): + targets = torch.stack([b[1] for b in batch]) + else: + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i] += torch.from_numpy(batch[i][0]) + return tensor, targets + elif isinstance(batch[0][0], torch.Tensor): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i].copy_(batch[i][0]) + return tensor, targets + else: + assert False + + +class PrefetchLoader: + + def __init__(self, + loader, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + fp16=False, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + device='cuda'): + self.loader = loader + self.mean = torch.tensor([x * 255 for x in mean]).npu().view(1, 3, 1, 1) + self.std = torch.tensor([x * 255 for x in std]).npu().view(1, 3, 1, 1) + self.fp16 = fp16 + if fp16: + self.mean = self.mean.half() + self.std = self.std.half() + if re_prob > 0.: + self.random_erasing = RandomErasing( + probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device=device) + else: + self.random_erasing = None + + def 
__iter__(self): + stream = torch.npu.Stream() + first = True + + for next_input, next_target in self.loader: + with torch.npu.stream(stream): + next_input = next_input.npu(non_blocking=True) + next_target = next_target.npu(non_blocking=True) + if self.fp16: + next_input = next_input.half().sub_(self.mean).div_(self.std) + else: + next_input = next_input.float().sub_(self.mean).div_(self.std) + if self.random_erasing is not None: + next_input = self.random_erasing(next_input) + + if not first: + yield input, target + else: + first = False + + torch.npu.current_stream().wait_stream(stream) + input = next_input + target = next_target + + yield input, target + + def __len__(self): + return len(self.loader) + + @property + def sampler(self): + return self.loader.sampler + + @property + def dataset(self): + return self.loader.dataset + + @property + def mixup_enabled(self): + if isinstance(self.loader.collate_fn, FastCollateTokenLabelMixup): + return self.loader.collate_fn.mixup_enabled + else: + return False + + @mixup_enabled.setter + def mixup_enabled(self, x): + if isinstance(self.loader.collate_fn, FastCollateTokenLabelMixup): + self.loader.collate_fn.mixup_enabled = x + + +############################## +# add argument device + +def create_loader( + device, + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False, + persistent_workers=True, +): + re_num_splits = 0 + if re_split: + # apply RE to second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + dataset.transform = create_transform( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader_args = dict( + batch_size=batch_size, + shuffle=(sampler is None), + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + persistent_workers=persistent_workers) + try: + loader = loader_class(dataset, **loader_args) + except TypeError as e: + loader_args.pop('persistent_workers') # only in Pytorch 1.7+ + loader = loader_class(dataset, **loader_args) + if use_prefetcher: + 
prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + device=device + ) + + return loader + +def create_token_label_loader( + device, + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False, + use_token_label=False, +): + re_num_splits = 0 + if re_split: + # apply RE to second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + if use_token_label: + transform_fn=create_token_label_transform + else: + transform_fn=create_transform + dataset.transform = transform_fn( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader = loader_class( + dataset, + batch_size=batch_size, + shuffle=(sampler is None), + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + ) + if use_prefetcher: + prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + device=device + ) + + return loader + + +class MultiEpochsDataLoader(torch.utils.data.DataLoader): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._DataLoader__initialized = False + self.batch_sampler = _RepeatSampler(self.batch_sampler) + self._DataLoader__initialized = True + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever. 
+ + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..fb252c0698aa107ae609c22bad8a6a4828482702 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/mixup.py @@ -0,0 +1,409 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import numpy as np +import torch +from torchvision.ops import roi_align +from torch.contrib.npu.optimized_lib import module as nnn + +def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) + +def get_featuremaps(label_maps_topk, num_classes, device='cuda'): + label_maps_topk_sizes = label_maps_topk[0].size() + label_maps = torch.full([label_maps_topk.size(0), num_classes, label_maps_topk_sizes[2], + label_maps_topk_sizes[3]], 0, dtype=torch.float32 ,device=device) + for _label_map, _label_topk in zip(label_maps, label_maps_topk): + _label_map = _label_map.scatter_( + 0, + _label_topk[1][:, :, :].long(), + _label_topk[0][:, :, :].float() + ) + return label_maps + +def get_label(label_maps, batch_coords,label_size=1,device='cuda'): + ''' + Adapted from https://github.com/naver-ai/relabel_imagenet/blob/main/utils/relabel_functions.py + Here we generate label for patch tokens and cls token separately and concat them together if given label_size>1 + ''' + num_batches = label_maps.size(0) + roialign1 = nnn.ROIAlign((label_size, label_size), 1.0, 2, False) + target_label = roialign1(label_maps, torch.cat( + [torch.arange(num_batches).view(num_batches, + 1).float().to(device), + batch_coords.float() * label_maps.size(3) - 0.5], 1)) + + if label_size>1: + roialign2 = nnn.ROIAlign((1, 1), 1.0, 2, False) + target_label_cls = roialign2(label_maps, torch.cat( + [torch.arange(num_batches).view(num_batches, + 1).float().to(device), + batch_coords.float() * label_maps.size(3) - 0.5], 1)) + B,C,H,W = target_label.shape + target_label = target_label.view(B,C,H*W) + target_label = torch.cat([target_label_cls.view(B,C,1),target_label],dim=2) + target_label = torch.nn.functional.softmax(target_label.squeeze(), 1) + return target_label + +def get_labelmaps_with_coords(label_maps_topk, num_classes, on_value=1., off_value=0.,label_size=1, device='cuda'): + ''' + Adapted from https://github.com/naver-ai/relabel_imagenet/blob/main/utils/relabel_functions.py + Generate the target label map for training from the given bbox and raw label map + ''' + # trick to get coords_map from label_map + random_crop_coords = label_maps_topk[:,2,0,0,:4].view(-1, 4) + random_crop_coords[:, 2:] += random_crop_coords[:, :2] + random_crop_coords = random_crop_coords.to(device) + + # trick to get 
ground truth from label_map + ground_truth = label_maps_topk[:,2,0,0,5].view(-1).to(dtype=torch.int64) + ground_truth = one_hot(ground_truth, num_classes, on_value=on_value, off_value=off_value, device=device) + + # get full label maps from raw topk labels + label_maps = get_featuremaps(label_maps_topk=label_maps_topk, + num_classes=num_classes,device=device) + + # get token-level label and ground truth + token_label = get_label(label_maps=label_maps, + batch_coords=random_crop_coords, + label_size=label_size, + device=device) + B,C = token_label.shape[:2] + token_label = token_label*on_value+off_value + if label_size==1: + return torch.cat([ground_truth.view(B,C,1),token_label.view(B,C,1)],dim=2) + else: + return torch.cat([ground_truth.view(B,C,1),token_label],dim=2) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda', label_size=1): + ''' + generate and mix target from the given label maps + target: label maps/ label maps with coords + num_classes: number of classes for the target + lam: lambda for mixup target + ''' + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + if len(target.size())>2: + if target.size(1)==3: + y1 = get_labelmaps_with_coords(target, num_classes, on_value=on_value, off_value=off_value, device=device, label_size=label_size) + y2 = y1.flip(0) + # y2 = get_labelmaps_with_coords(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device, label_size=label_size) + else: + raise ValueError("Not supported label type") + else: + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) + + return y1 * lam + y2 * (1. - lam) + + +def rand_bbox(img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. 
+ + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): + """ Generate bbox and apply lambda correction. + """ + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class TokenLabelMixup: + """ Mixup/Cutmix with label that applies different params to each element or whole batch + Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + label_size (int): target label size + """ + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000, label_size=1): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + self.label_size=label_size + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. 
and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1. 
- lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, label_size=self.label_size) + return x, target + + +class FastCollateTokenLabelMixup(TokenLabelMixup): + """ Fast Collate w/ Mixup/Cutmix with label that applies different params to each element or whole batch + Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/mixup.py + + A Mixup impl that's performed while collating the batches. + """ + + def _mix_elem_collate(self, output, batch, half=False): + batch_size = len(batch) + num_elem = batch_size // 2 if half else batch_size + assert len(output) == num_elem + lam_batch, use_cutmix = self._params_per_elem(num_elem) + for i in range(num_elem): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed = batch[i][0] + if lam != 1.: + if use_cutmix[i]: + if not half: + mixed = mixed.copy() + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + if half: + lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_pair_collate(self, output, batch): + batch_size = len(batch) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed_i = batch[i][0] + mixed_j = batch[j][0] + assert 0 <= lam <= 1.0 + if lam < 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + patch_i = mixed_i[:, yl:yh, xl:xh].copy() + mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] + mixed_j[:, yl:yh, xl:xh] = patch_i + lam_batch[i] = lam + else: + mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) + mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) + mixed_i = mixed_temp + np.rint(mixed_j, out=mixed_j) + np.rint(mixed_i, out=mixed_i) + output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) + output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_batch_collate(self, output, batch): + batch_size = len(batch) + lam, use_cutmix = self._params_per_batch() + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + for i in range(batch_size): + j = batch_size - i - 1 + mixed = batch[i][0] + if lam != 1.: + if use_cutmix: + mixed = mixed.copy() # don't want to modify the original while iterating + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + return lam + + def __call__(self, batch, _=None): + batch_size = len(batch) + assert batch_size % 2 == 0, 
'Batch size should be even when using this' + half = 'half' in self.mode + if half: + batch_size //= 2 + output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + if self.mode == 'elem' or self.mode == 'half': + lam = self._mix_elem_collate(output, batch, half=half) + elif self.mode == 'pair': + lam = self._mix_pair_collate(output, batch) + else: + lam = self._mix_batch_collate(output, batch) + + if type(batch[0][1])==type(0): + target = torch.tensor([b[1] for b in batch], dtype=torch.int64) + else: + target = torch.stack([b[1] for b in batch],0) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu',label_size=self.label_size) + target = target[:batch_size] + return output, target diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py new file mode 100644 index 0000000000000000000000000000000000000000..765cf61048f7f20ee50534d4d98d5139fff53013 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/data/random_augment_label.py @@ -0,0 +1,576 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" +Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/auto_augment.py and modified for token labeling +AutoAugment, RandAugment +""" +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np +from scipy import ndimage +import torch + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + + + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. 
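+# Note: the *_level_to_arg helpers defined below rescale an integer magnitude in
+# [0, _MAX_LEVEL] to each op's argument range, e.g. a shear magnitude of 9 maps to
+# (9 / 10.) * 0.3 = 0.27 with a random sign flip, and is then applied to both the
+# image and its label map.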
+ +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, +) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + +def affine_label(label, matrix): + + # label: 2, k, H, W + # label[0] value, label[1] index + a,b,c,d,e,f = matrix + affine_matrix = [[1,0,0,0],[0,a,b,c],[0,d,e,f]] + value = ndimage.affine_transform(label[0],matrix=affine_matrix, order=0, mode="constant") + index = ndimage.affine_transform(label[1],matrix=affine_matrix, order=0, mode="nearest") + + return torch.from_numpy(np.stack([value, index],axis=0)) + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + +def shear_y_label(label, factor): + return affine_label(label, (1, factor, 0, 0, 1, 0)) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + +def shear_x_label(label, factor): + return affine_label(label, (1, 0, 0, factor, 1, 0)) + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + +def translate_y_rel_label(label, pct): + pixels = pct * label.size(2) + return affine_label(label, (1, 0, pixels, 0, 1, 0)) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + +def translate_x_rel_label(label, pct): + pixels = pct * label.size(3) + return affine_label(label, (1, 0, 0, 0, 1, pixels)) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + +def rotate_label(label, degrees): + _,_, w, h = label.size() + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + 
f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return affine_label(label, matrix) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. 
or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + +class AugmentOp: + + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.name = name + self.aug_fn = NAME_TO_OP[name] + self.label_fn = NAME_TO_LABELOP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION, + ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. 
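+ # Illustrative example (values assumed, not from the reference config):
+ # with magnitude=9 and hparams={'magnitude_std': 0.5}, each __call__ draws
+ # magnitude from N(9, 0.5) and clips it to [0, _MAX_LEVEL] before level_fn
+ # maps it to the op-specific argument tuple.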
+ self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img, label): + if self.prob < 1.0 and random.random() > self.prob: + return img, label + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple() + if self.label_fn is not None: + + aug_label = self.label_fn(label, *level_args) + else: + aug_label = label + return self.aug_fn(img, *level_args, **self.kwargs), aug_label + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} +# Remove TranslateX and TranslateY here since it is actually not used in random aug +# Only spatial op should be applied to the label map +NAME_TO_LABELOP = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': rotate_label, + 'Posterize': None, + 'PosterizeIncreasing': None, + 'PosterizeOriginal': None, + 'Solarize': None, + 'SolarizeIncreasing': None, + 'SolarizeAdd': None, + 'Color': None, + 'ColorIncreasing': None, + 'Contrast': None, + 'ContrastIncreasing': None, + 'Brightness': None, + 'BrightnessIncreasing': None, + 'Sharpness': None, + 'SharpnessIncreasing': None, + 'ShearX': shear_x_label, + 'ShearY': shear_y_label, + 'TranslateX': None, + 'TranslateY': None, + 'TranslateXRel': translate_x_rel_label, + 'TranslateYRel': translate_y_rel_label, +} + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 
'TranslateYRel', + #'Cutout' +] + + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' +] + + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class RandAugment: + ''' + Apply RandAug on both image and dense label map + ''' + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img, label): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights) + for op in ops: + img, label = op(img, label) + return img, label + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform with label + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A PyTorch compatible Transform + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1544c017967c2101d374786271ccd15ae542b02b --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .cross_entropy import TokenLabelCrossEntropy,TokenLabelSoftTargetCrossEntropy \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..5615b9a0cb07d29306b933436d463894fc98c6c0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/loss/cross_entropy.py @@ -0,0 +1,99 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +class SoftTargetCrossEntropy(nn.Module): + + def __init__(self): + super(SoftTargetCrossEntropy, self).__init__() + + def forward(self, x, target): + N_rep = x.shape[0] + N = target.shape[0] + if not N==N_rep: + target = target.repeat(N_rep//N,1) + loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1) + return loss.mean() + +class TokenLabelSoftTargetCrossEntropy(nn.Module): + + def __init__(self): + super(TokenLabelSoftTargetCrossEntropy, self).__init__() + + def forward(self, x, target): + N_rep = x.shape[0] + N = target.shape[0] + if not N==N_rep: + target = target.repeat(N_rep//N,1) + if len(target.shape)==3 and target.shape[-1]==2: + ground_truth=target[:,:,0] + target = target[:,:,1] + loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1) + return loss.mean() + +class TokenLabelCrossEntropy(nn.Module): + """ + Token labeling loss. + """ + def __init__(self, dense_weight=1.0, cls_weight = 1.0, mixup_active=True, classes = 1000, ground_truth = False): + """ + Constructor Token labeling loss. + """ + super(TokenLabelCrossEntropy, self).__init__() + + + self.CE = SoftTargetCrossEntropy() + + self.dense_weight = dense_weight + self.mixup_active = mixup_active + self.classes = classes + self.cls_weight = cls_weight + self.ground_truth = ground_truth + assert dense_weight+cls_weight>0 + + + def forward(self, x, target): + + output, aux_output, bb = x + bbx1, bby1, bbx2, bby2 = bb + + B,N,C = aux_output.shape + if len(target.shape)==2: + target_cls=target + target_aux = target.repeat(1,N).reshape(B*N,C) + else: + target_cls = target[:,:,1] + if self.ground_truth: + # use ground truth to help correct label. + # rely more on ground truth if target_cls is incorrect. + ground_truth = target[:,:,0] + ratio = (0.9 - 0.4 * (ground_truth.max(-1)[1] == target_cls.max(-1)[1])).unsqueeze(-1) + target_cls = target_cls * ratio + ground_truth * (1 - ratio) + target_aux = target[:,:,2:] + target_aux = target_aux.transpose(1,2).reshape(-1,C) + lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / N) + if lam<1: + target_cls = lam*target_cls + (1-lam)*target_cls.flip(0) + + aux_output = aux_output.reshape(-1,C) + loss_cls = self.CE(output, target_cls) + loss_aux = self.CE(aux_output, target_aux) + return self.cls_weight*loss_cls+self.dense_weight* loss_aux + diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..61b4a019c51f7dc97e0728b4ee2519a393e275f5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .lvvit import * diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..42221bac677ed1e6c2ce7fe16e003d51bdfca536 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/layers.py @@ -0,0 +1,437 @@ + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn +import numpy as np +from functools import partial +import torch.nn.init as init +import torch.nn.functional as F +import math +from timm.models.layers import DropPath, to_2tuple + +DROPOUT_FLOPS = 4 +LAYER_NORM_FLOPS = 5 +ACTIVATION_FLOPS = 8 +SOFTMAX_FLOPS = 5 + +class GroupLinear(nn.Module): + ''' + Group Linear operator + ''' + def __init__(self, in_planes, out_channels,groups=1, bias=True): + super(GroupLinear, self).__init__() + assert in_planes%groups==0 + assert out_channels%groups==0 + self.in_dim = in_planes + self.out_dim = out_channels + self.groups=groups + self.bias = bias + self.group_in_dim = int(self.in_dim/self.groups) + self.group_out_dim = int(self.out_dim/self.groups) + + self.group_weight = nn.Parameter(torch.zeros(self.groups, self.group_in_dim, self.group_out_dim)) + self.group_bias=nn.Parameter(torch.zeros(self.out_dim)) + + def forward(self, x): + t,b,d=x.size() + x = x.view(t,b,self.groups,int(d/self.groups)) + out = torch.einsum('tbgd,gdf->tbgf', (x, self.group_weight)).reshape(t,b,self.out_dim)+self.group_bias + return out + def extra_repr(self): + s = ('{in_dim}, {out_dim}') + if self.groups != 1: + s += ', groups={groups}' + if self.bias is None: + s += ', bias=False' + return s.format(**self.__dict__) + + +class Mlp(nn.Module): + ''' + MLP with support to use group linear operator + ''' + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., group=1): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + if group==1: + self.fc1 = nn.Linear(in_features, hidden_features) + self.fc2 = nn.Linear(hidden_features, out_features) + else: + self.fc1 = GroupLinear(in_features, hidden_features,group) + self.fc2 = GroupLinear(hidden_features, out_features,group) + self.act = act_layer() + + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class GroupNorm(nn.Module): + def __init__(self, num_groups, embed_dim, 
eps=1e-5, affine=True): + super().__init__() + self.gn = nn.GroupNorm(num_groups, embed_dim,eps,affine) + + def forward(self, x): + B,T,C = x.shape + x = x.view(B*T,C) + x = self.gn(x) + x = x.view(B,T,C) + return x + + +class Attention(nn.Module): + ''' + Multi-head self-attention + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + with some modification to support different num_heads and head_dim. + ''' + def __init__(self, dim, num_heads=8, head_dim=None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + if head_dim is not None: + self.head_dim=head_dim + else: + head_dim = dim // num_heads + self.head_dim = head_dim + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, self.head_dim* self.num_heads * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(self.head_dim* self.num_heads, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, padding_mask=None): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + # B,heads,N,C/heads + q, k, v = qkv[0], qkv[1], qkv[2] + + # trick here to make q@k.t more stable + attn = ((q * self.scale) @ k.transpose(-2, -1)) + if padding_mask is not None: + attn = attn.view(B, self.num_heads, N, N) + attn = attn.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + attn_float = attn.softmax(dim=-1, dtype=torch.float32) + attn = attn_float.type_as(attn) + else: + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.head_dim* self.num_heads) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + ''' + Pre-layernorm transformer block + ''' + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.dim = dim + self.mlp_hidden_dim = int(dim * mlp_ratio) + self.skip_lam = skip_lam + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=self.mlp_hidden_dim, act_layer=act_layer, drop=drop, group=group) + + def forward(self, x, padding_mask=None): + x = x + self.drop_path(self.attn(self.norm1(x),padding_mask))/self.skip_lam + x = x + self.drop_path(self.mlp(self.norm2(x)))/self.skip_lam + return x + + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + i = self.mlp_hidden_dim + mha_block_flops = dict( + kqv=3 * h * h , + attention_scores=h * s, + attn_softmax=SOFTMAX_FLOPS * s * heads, + attention_dropout=DROPOUT_FLOPS * s * heads, + attention_scale=s * heads, + attention_weighted_avg_values=h * s, + attn_output=h * h, + attn_output_bias=h, + attn_output_dropout=DROPOUT_FLOPS * h, + attn_output_residual=h, + attn_output_layer_norm=LAYER_NORM_FLOPS * h,) + ffn_block_flops = dict( + intermediate=h * i, + intermediate_act=ACTIVATION_FLOPS * i, + intermediate_bias=i, + output=h * i, + output_bias=h, + output_dropout=DROPOUT_FLOPS * h, + output_residual=h, + output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(mha_block_flops.values())*s + sum(ffn_block_flops.values())*s + +class MHABlock(nn.Module): + """ + Multihead Attention block with residual branch + """ + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.dim = dim + self.norm1 = norm_layer(dim) + self.skip_lam = skip_lam + self.attn = Attention( + dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x, padding_mask=None): + x = x + self.drop_path(self.attn(self.norm1(x*self.skip_lam), padding_mask))/self.skip_lam + return x + + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + block_flops = dict( + kqv=3 * h * h , + attention_scores=h * s, + attn_softmax=SOFTMAX_FLOPS * s * heads, + attention_dropout=DROPOUT_FLOPS * s * heads, + attention_scale=s * heads, + attention_weighted_avg_values=h * s, + attn_output=h * h, + attn_output_bias=h, + attn_output_dropout=DROPOUT_FLOPS * h, + attn_output_residual=h, + attn_output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(block_flops.values())*s + +class FFNBlock(nn.Module): + """ + Feed forward network with residual branch + """ + def __init__(self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, group=1, skip_lam=1.): + super().__init__() + self.skip_lam = skip_lam + self.dim = dim + self.mlp_hidden_dim = int(dim * mlp_ratio) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=self.mlp_hidden_dim, act_layer=act_layer, drop=drop, group=group) + def forward(self, x): + x = x + self.drop_path(self.mlp(self.norm2(x*self.skip_lam)))/self.skip_lam + return x + def flops(self, s): + heads = self.attn.num_heads + h = self.dim + i = self.mlp_hidden_dim + block_flops = dict( + intermediate=h * i, + intermediate_act=ACTIVATION_FLOPS * i, + intermediate_bias=i, + output=h * i, + output_bias=h, + output_dropout=DROPOUT_FLOPS * h, + output_residual=h, + output_layer_norm=LAYER_NORM_FLOPS * h,) + + return sum(block_flops.values())*s + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Conv2d(feature_dim, embed_dim,kernel_size=1) + + def forward(self, x): + x = self.backbone(x)[-1] + x = self.proj(x) + return x + + +class PatchEmbedNaive(nn.Module): + """ + Image to Patch Embedding + from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
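+ # proj is a patch_size x patch_size conv with matching stride, so each
+ # non-overlapping patch is projected to embed_dim channels:
+ # [B, in_chans, H, W] -> [B, embed_dim, H // patch_size, W // patch_size].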
+ x = self.proj(x) + return x + + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + proj=img_size*img_size*3*self.embed_dim, + ) + return sum(block_flops.values()) + + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(64) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(64) + + self.proj = nn.Conv2d(64, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + conv1=img_size/2*img_size/2*3*64*7*7, + conv2=img_size/2*img_size/2*64*64*3*3, + conv3=img_size/2*img_size/2*64*64*3*3, + proj=img_size/2*img_size/2*64*self.embed_dim, + ) + return sum(block_flops.values()) + + +class PatchEmbed4_2_128(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution and 128 filters + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = nn.Conv2d(in_chans, 128, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = nn.BatchNorm2d(128) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = nn.BatchNorm2d(128) + self.conv3 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(128) + + self.proj = nn.Conv2d(128, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x + def flops(self): + img_size = self.img_size[0] + block_flops = dict( + conv1=img_size/2*img_size/2*3*128*7*7, + conv2=img_size/2*img_size/2*128*128*3*3, + conv3=img_size/2*img_size/2*128*128*3*3, + proj=img_size/2*img_size/2*128*self.embed_dim, + ) + return sum(block_flops.values()) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py new file mode 100644 index 
0000000000000000000000000000000000000000..9be9c2fb1dcd8635c061bbb4181775f5473ff84e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/models/lvvit.py @@ -0,0 +1,298 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_ +from timm.models.resnet import resnet26d, resnet50d, resnet101d +import numpy as np + +from .layers import * + + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), + 'classifier': 'head', + **kwargs + } + +default_cfgs = { + 'LV_ViT_Tiny': _cfg(), + 'LV_ViT': _cfg(), + 'LV_ViT_Medium': _cfg(crop_pct=1.0), + 'LV_ViT_Large': _cfg(crop_pct=1.0), +} + +def get_block(block_type, **kargs): + if block_type=='mha': + # multi-head attention block + return MHABlock(**kargs) + elif block_type=='ffn': + # feed forward block + return FFNBlock(**kargs) + elif block_type=='tr': + # transformer block + return Block(**kargs) + + +def rand_bbox(size, beta=1.0): + W = size[2] + H = size[3] + while True: + lam = np.random.beta(beta, beta) + cut_rat = np.sqrt(1. 
- lam) + cut_w = np.int(W * cut_rat) + cut_h = np.int(H * cut_rat) + + # uniform + cx = np.random.randint(W) + cy = np.random.randint(H) + + bbx1 = np.clip(cx - cut_w // 2, 0, W) + bby1 = np.clip(cy - cut_h // 2, 0, H) + bbx2 = np.clip(cx + cut_w // 2, 0, W) + bby2 = np.clip(cy + cut_h // 2, 0, H) + + if bbx1 != bbx2 and bby1 != bby2: + break + + return bbx1, bby1, bbx2, bby2 + + +def get_dpr(drop_path_rate,depth,drop_path_decay='linear'): + if drop_path_decay=='linear': + # linear dpr decay + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + elif drop_path_decay=='fix': + # use fixed dpr + dpr= [drop_path_rate]*depth + else: + # use predefined drop_path_rate list + assert len(drop_path_rate)==depth + dpr=drop_path_rate + return dpr + + +class LV_ViT(nn.Module): + """ Vision Transformer with tricks + Arguements: + p_emb: different conv based position embedding (default: 4 layer conv) + skip_lam: residual scalar for skip connection (default: 1.0) + order: which order of layers will be used (default: None, will override depth if given) + mix_token: use mix token augmentation for batch of tokens (default: False) + return_dense: whether to return feature of all tokens with an additional aux_head (default: False) + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., drop_path_decay='linear', hybrid_backbone=None, norm_layer=nn.LayerNorm, p_emb='4_2', head_dim = None, + skip_lam = 1.0,order=None, mix_token=False, return_dense=False): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.output_dim = embed_dim if num_classes==0 else num_classes + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + if p_emb=='4_2': + patch_embed_fn = PatchEmbed4_2 + elif p_emb=='4_2_128': + patch_embed_fn = PatchEmbed4_2_128 + else: + patch_embed_fn = PatchEmbedNaive + + self.patch_embed = patch_embed_fn(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + if order is None: + dpr=get_dpr(drop_path_rate, depth, drop_path_decay) + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, head_dim=head_dim, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, skip_lam=skip_lam) + for i in range(depth)]) + else: + # use given order to sequentially generate modules + dpr=get_dpr(drop_path_rate, len(order), drop_path_decay) + self.blocks = nn.ModuleList([ + get_block(order[i], + dim=embed_dim, num_heads=num_heads, head_dim=head_dim, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, skip_lam=skip_lam) + for i in range(len(order))]) + + self.norm = norm_layer(embed_dim) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + self.return_dense=return_dense + self.mix_token=mix_token + + if return_dense: + 
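+ # The auxiliary head maps every patch token to num_classes and is consumed
+ # by the token-labeling loss together with the main cls-token head.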
self.aux_head=nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + if mix_token: + self.beta = 1.0 + assert return_dense, "always return all features when mixtoken is enabled" + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, GroupLinear): + trunc_normal_(m.group_weight, std=.02) + if isinstance(m, GroupLinear) and m.group_bias is not None: + nn.init.constant_(m.group_bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_embeddings(self,x): + x = self.patch_embed(x) + return x + def forward_tokens(self, x): + B = x.shape[0] + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x + + def forward_features(self,x): + # simple forward to obtain feature map (without mixtoken) + x = self.forward_embeddings(x) + x = x.flatten(2).transpose(1, 2) + x = self.forward_tokens(x) + return x + + def forward(self, x): + x = self.forward_embeddings(x) + + # token level mixtoken augmentation + if self.mix_token and self.training: + patch_h, patch_w = x.shape[2],x.shape[3] + bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), self.beta) + temp_x = x.clone() + temp_x[:, :, bbx1:bbx2, bby1:bby2] = x.flip(0)[:, :, bbx1:bbx2, bby1:bby2] + x = temp_x + else: + bbx1, bby1, bbx2, bby2 = 0,0,0,0 + + x = x.flatten(2).transpose(1, 2) + x = self.forward_tokens(x) + x_cls = self.head(x[:,0]) + + + if self.return_dense: + # import pdb + # pdb.set_trace() + x_aux = self.aux_head(x[:,1:]) + if not self.training: + return x_cls+0.5*x_aux.max(1)[0] + + # recover the mixed part + if self.mix_token and self.training: + x_aux = x_aux.reshape(x_aux.shape[0],patch_h, patch_w,x_aux.shape[-1]) + temp_x = x_aux.clone() + # print("===================python print===================") + # print("x_aux shape after clone", x_aux.shape) + # print("x_aux stride after clone", x_aux.stride()) + # print("x_aux format after clone", x_aux.storage().npu_format()) + # print("temp_x shape after clone", temp_x.shape) + # print("temp_x stride after clone", temp_x.stride()) + # print("temp_x format after clone", temp_x.storage().npu_format()) + # print("bbx1, bbx2, bby1, bby2: ", bbx1, bbx2, bby1, bby2) + # print("x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :].shape: ", x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :].shape) + # print("temp_x[:, bbx1:bbx2, bby1:bby2, :].shape: ", temp_x[:, bbx1:bbx2, bby1:bby2, :].shape) + # print("===================python print end===================") + temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :] + x_aux = temp_x + x_aux = x_aux.reshape(x_aux.shape[0],patch_h*patch_w,x_aux.shape[-1]) + # print("===================python print===================") + # print("x_aux shape after reshape", x_aux.shape) + # print("x_aux stride after reshape", x_aux.stride()) + # print("x_aux format 
after reshape", x_aux.storage().npu_format()) + # print("===================python print end===================") + + return x_cls, x_aux, (bbx1, bby1, bbx2, bby2) + return x_cls + +@register_model +def vit(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb=1, **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + + +@register_model +def lvvit(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb='4_2',skip_lam=2., **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + +@register_model +def lvvit_s(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., + p_emb='4_2',skip_lam=2., return_dense=True,mix_token=True, **kwargs) + model.default_cfg = default_cfgs['LV_ViT'] + return model + +@register_model +def lvvit_m(pretrained=False, **kwargs): + model = LV_ViT(patch_size=16, embed_dim=512, depth=20, num_heads=8, mlp_ratio=3., + p_emb='4_2',skip_lam=2., return_dense=True,mix_token=True, **kwargs) + model.default_cfg = default_cfgs['LV_ViT_Medium'] + return model + + +@register_model +def lvvit_l(pretrained=False, **kwargs): + order = ['tr']*24 # this will override depth, can also be set as None + model = LV_ViT(patch_size=16, embed_dim=768,depth=24, num_heads=12, mlp_ratio=3., + p_emb='4_2_128',skip_lam=3., return_dense=True,mix_token=True, order=order, **kwargs) + model.default_cfg = default_cfgs['LV_ViT_Large'] + return model diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c755f330d14c4dd47508b3acf377efa0cfbb65 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +from .utils import load_pretrained_weights \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3aa6e03ace681290ba9335e6e6a662a0ab287e21 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/tlt/utils/utils.py @@ -0,0 +1,158 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +# Adapted for token labeling + +''' +- resize_pos_embed: resize position embedding +- load_for_transfer_learning: load pretrained paramters to model in transfer learning +- get_mean_and_std: calculate the mean and std value of dataset. +''' + +import os +import sys +import time +import torch +import math + +import torch.nn as nn +import torch.nn.init as init +import logging +import os +from collections import OrderedDict +import torch.nn.functional as F + +_logger = logging.getLogger(__name__) + +def resize_pos_embed(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + + posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] # posemb_tok is for cls token, posemb_grid for the following tokens + ntok_new -= 1 + gs_old = int(math.sqrt(len(posemb_grid))) # 14 + gs_new = int(math.sqrt(ntok_new)) # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) # [1, 196, dim]->[1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) # [1, dim, 24, 24] -> [1, 24*24, dim] + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) # [1, 24*24+1, dim] + return posemb + +def resize_pos_embed_without_cls(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + posemb_grid = posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) # 14 + gs_new = int(math.sqrt(ntok_new)) # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) # [1, 196, dim]->[1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) # [1, dim, 24, 24] -> [1, 24*24, dim] + return posemb_grid + + +def resize_pos_embed_4d(posemb, posemb_new): # example: 224:(14x14+1)-> 384: (24x24+1) + # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + gs_old = posemb.shape[1] # 14 + gs_new = posemb_new.shape[1] # 24 + _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) + posemb_grid = posemb + posemb_grid = posemb_grid.permute(0, 3, 1, 2) # [1, 14, 14, dim]->[1, dim, 14, 14] + posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bicubic') # [1, dim, 14, 14] -> [1, dim, 24, 24] + posemb_grid = posemb_grid.permute(0, 2, 3, 1) # [1, dim, 24, 24]->[1, 24, 24, dim] + return posemb_grid + + +def load_state_dict(checkpoint_path,model, use_ema=False, num_classes=1000): + if checkpoint_path and os.path.isfile(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + state_dict_key = 'state_dict' + if isinstance(checkpoint, dict): + if use_ema and 'state_dict_ema' in checkpoint: + state_dict_key = 'state_dict_ema' + if state_dict_key and state_dict_key in checkpoint: + new_state_dict = OrderedDict() + for k, v in checkpoint[state_dict_key].items(): + # strip `module.` prefix + name = k[7:] if k.startswith('module') else k + new_state_dict[name] = v + state_dict = new_state_dict + else: + state_dict = checkpoint + _logger.info("Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path)) + if num_classes != state_dict['head.bias'].shape[0]: + # completely discard fully connected for all other differences between pretrained and created model + del state_dict['head.weight'] + del state_dict['head.bias'] + old_aux_head_weight = state_dict.pop('aux_head.weight', None) + old_aux_head_bias = state_dict.pop('aux_head.bias', None) + + + old_posemb = state_dict['pos_embed'] + if model.pos_embed.shape != old_posemb.shape: # need resize the position embedding by interpolate + if len(old_posemb.shape)==3: + if int(math.sqrt(old_posemb.shape[1]))**2==old_posemb.shape[1]: + new_posemb = resize_pos_embed_without_cls(old_posemb, model.pos_embed) + else: + new_posemb = resize_pos_embed(old_posemb, model.pos_embed) + elif len(old_posemb.shape)==4: + new_posemb = resize_pos_embed_4d(old_posemb, model.pos_embed) + state_dict['pos_embed'] = new_posemb + + return state_dict + else: + _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) + raise FileNotFoundError() + + +def load_pretrained_weights(model, checkpoint_path, use_ema=False, strict=True, num_classes=1000): + state_dict = load_state_dict(checkpoint_path, model, use_ema, num_classes) + model.load_state_dict(state_dict, strict=strict) + + +def get_mean_and_std(dataset): + '''Compute the mean and std value of dataset.''' + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) + mean = torch.zeros(3) + std = torch.zeros(3) + print('==> Computing mean and std..') + for inputs, targets in dataloader: + for i in range(3): + mean[i] += inputs[:,i,:,:].mean() + std[i] += inputs[:,i,:,:].std() + mean.div_(len(dataset)) + std.div_(len(dataset)) + return mean, std + +def init_params(net): + '''Init layer parameters.''' + for m in net.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal(m.weight, mode='fan_out') + if m.bias: + init.constant(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + init.constant(m.weight, 1) + init.constant(m.bias, 0) + elif isinstance(m, nn.Linear): + init.normal(m.weight, std=1e-3) + if m.bias: + init.constant(m.bias, 0) + diff --git a/PyTorch/contrib/cv/classification/LVVIT/validate.py 
b/PyTorch/contrib/cv/classification/LVVIT/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..3fc51aaef37d511919810e468a554b8b5c794b2e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/validate.py @@ -0,0 +1,389 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 +#!/usr/bin/env python3 +""" ImageNet Validation Script +Adapted from https://github.com/rwightman/pytorch-image-models +The script is further extend to evaluate LV-ViT models + +""" +import argparse +import os +import csv +import glob +import time +import logging +import torch +import torch.nn as nn +import torch.nn.parallel +from collections import OrderedDict +from contextlib import suppress + +from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models +from timm.models.helpers import load_state_dict +from timm.data import create_dataset, resolve_data_config, RealLabelsImagenet +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy +from tlt.data import create_loader +import tlt.models + +has_apex = False +try: + from apex import amp + has_apex = True +except ImportError: + pass + +has_native_amp = False +try: + if getattr(torch.cuda.amp, 'autocast') is not None: + has_native_amp = True +except AttributeError: + pass + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('validate') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +parser.add_argument('--split', metavar='NAME', default='validation', + help='dataset split (default: validation)') +parser.add_argument('--model', '-m', metavar='NAME', default='dpn92', + help='model architecture (default: dpn92)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 2)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--img-size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. 
--input-size 3 224 224), uses model default if empty')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop pct')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+                    help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+                    help='Image resize interpolation type (overrides model)')
+parser.add_argument('--num-classes', type=int, default=None,
+                    help='Number of classes in dataset')
+parser.add_argument('--class-map', default='', type=str, metavar='FILENAME',
+                    help='path to class to idx mapping file (default: "")')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--log-freq', default=50, type=int,
+                    metavar='N', help='batch logging frequency (default: 50)')
+parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--num-gpu', type=int, default=1,
+                    help='Number of GPUs to use')
+parser.add_argument('--no-test-pool', dest='no_test_pool', action='store_true',
+                    help='disable test time pool')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+                    help='disable fast prefetcher')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+parser.add_argument('--amp', action='store_true', default=False,
+                    help='Use AMP mixed precision.
Defaults to Apex, fallback to native Torch AMP.') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--tf-preprocessing', action='store_true', default=False, + help='Use Tensorflow preprocessing pipeline (require CPU TF installed') +parser.add_argument('--use-ema', dest='use_ema', action='store_true', + help='use ema version of weights if present') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') +parser.add_argument('--legacy-jit', dest='legacy_jit', action='store_true', + help='use legacy jit mode for pytorch 1.5/1.5.1/1.6 to get back fusion performance') +parser.add_argument('--results-file', default='', type=str, metavar='FILENAME', + help='Output csv file for validation results (summary)') +parser.add_argument('--real-labels', default='', type=str, metavar='FILENAME', + help='Real labels JSON file for imagenet evaluation') +parser.add_argument('--valid-labels', default='', type=str, metavar='FILENAME', + help='Valid label indices txt file for validation of partial label space') + + +def validate(args): + # might as well try to validate something + args.pretrained = args.pretrained or not args.checkpoint + args.prefetcher = not args.no_prefetcher + amp_autocast = suppress # do nothing + if args.amp: + if has_native_amp: + args.native_amp = True + elif has_apex: + args.apex_amp = True + else: + _logger.warning("Neither APEX or Native Torch AMP is available.") + assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set." + if args.native_amp: + amp_autocast = torch.cuda.amp.autocast + _logger.info('Validating in mixed precision with native PyTorch AMP.') + elif args.apex_amp: + _logger.info('Validating in mixed precision with NVIDIA APEX AMP.') + else: + _logger.info('Validating in float32. AMP not enabled.') + + if args.legacy_jit: + set_jit_legacy() + device = torch.device(f"npu:0") + + # create model + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + in_chans=3, + global_pool=args.gp, + scriptable=args.torchscript, + img_size=args.img_size) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
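+ # Fall back to the model's own class count when --num-classes is not supplied.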
+        args.num_classes = model.num_classes
+
+    if args.checkpoint:
+        load_checkpoint(model, args.checkpoint, args.use_ema, strict=False)
+
+    param_count = sum([m.numel() for m in model.parameters()])
+    _logger.info('Model %s created, param count: %d' % (args.model, param_count))
+
+    data_config = resolve_data_config(vars(args), model=model, use_test_size=True)
+    test_time_pool = False
+    if not args.no_test_pool:
+        model, test_time_pool = apply_test_time_pool(model, data_config, use_test_size=True)
+
+    if args.torchscript:
+        torch.jit.optimized_execution(True)
+        model = torch.jit.script(model)
+
+    # the script runs on a single Ascend NPU, so the model, loss and host-side
+    # tensors are all moved to the npu device rather than cuda
+    model = model.npu()
+    if args.apex_amp:
+        model = amp.initialize(model, opt_level='O1')
+
+    if args.channels_last:
+        model = model.to(memory_format=torch.channels_last)
+
+    if args.num_gpu > 1:
+        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu)))
+
+    criterion = nn.CrossEntropyLoss().npu()
+
+    # dataset = create_dataset(
+    #     root=args.data, name=args.dataset, split=args.split,
+    #     load_bytes=args.tf_preprocessing, class_map=args.class_map)
+
+    dataset = create_dataset(
+        name=args.dataset, root=args.data, split=args.split, is_training=False, batch_size=args.batch_size)
+
+    if args.valid_labels:
+        with open(args.valid_labels, 'r') as f:
+            valid_labels = {int(line.rstrip()) for line in f}
+            valid_labels = [i in valid_labels for i in range(args.num_classes)]
+    else:
+        valid_labels = None
+
+    if args.real_labels:
+        real_labels = RealLabelsImagenet(dataset.filenames(basename=True), real_json=args.real_labels)
+    else:
+        real_labels = None
+
+    crop_pct = 1.0 if test_time_pool else data_config['crop_pct']
+
+    '''
+    loader = create_loader(
+        device,
+        dataset,
+        #dataset_train = dataset,
+        input_size=data_config['input_size'],
+        batch_size=args.batch_size,
+        use_prefetcher=args.prefetcher,
+        interpolation=data_config['interpolation'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        crop_pct=crop_pct,
+        pin_memory=args.pin_mem,
+        tf_preprocessing=args.tf_preprocessing)
+    '''
+    loader = create_loader(
+        device,
+        dataset,
+        input_size=data_config['input_size'],
+        batch_size=args.batch_size,
+        is_training=False,
+        use_prefetcher=args.prefetcher,
+        interpolation=data_config['interpolation'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+        num_workers=args.workers,
+        #distributed=args.distributed,
+        crop_pct=data_config['crop_pct'],
+        pin_memory=args.pin_mem,
+    )
+
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+
+    model.eval()
+    with torch.no_grad():
+        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
+        input = torch.randn((args.batch_size,) + data_config['input_size']).npu()
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
+        model(input)
+        end = time.time()
+        for batch_idx, (input, target) in enumerate(loader):
+            if args.no_prefetcher:
+                target = target.npu()
+                input = input.npu()
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
+
+            # compute output
+            with amp_autocast():
+                output = model(input)
+            if isinstance(output, (tuple, list)):
+                output = output[0]
+            if valid_labels is not None:
+                output = output[:, valid_labels]
+            loss = criterion(output, target)
+
+            if real_labels is not None:
+                real_labels.add_result(output)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
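+            # AverageMeter weights each update by the batch size, so the reported
+            # averages are per-sample rather than per-batch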
+            losses.update(loss.item(), input.size(0))
+            top1.update(acc1.item(), input.size(0))
+            top5.update(acc5.item(), input.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if batch_idx % args.log_freq == 0:
+                _logger.info(
+                    'Test: [{0:>4d}/{1}] '
+                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+                    'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
+                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
+                        batch_idx, len(loader), batch_time=batch_time,
+                        rate_avg=input.size(0) / batch_time.avg,
+                        loss=losses, top1=top1, top5=top5))
+
+    if real_labels is not None:
+        # real labels mode replaces topk values at the end
+        top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5)
+    else:
+        top1a, top5a = top1.avg, top5.avg
+    results = OrderedDict(
+        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
+        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
+        param_count=round(param_count / 1e6, 2),
+        img_size=data_config['input_size'][-1],
+        crop_pct=crop_pct,
+        interpolation=data_config['interpolation'])
+
+    _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
+        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+
+    return results
+
+
+def main():
+    setup_default_logging()
+    args = parser.parse_args()
+    model_cfgs = []
+    model_names = []
+    if os.path.isdir(args.checkpoint):
+        # validate all checkpoints in a path with same model
+        checkpoints = glob.glob(args.checkpoint + '/*.pth.tar')
+        checkpoints += glob.glob(args.checkpoint + '/*.pth')
+        model_names = list_models(args.model)
+        model_cfgs = [(args.model, c) for c in sorted(checkpoints, key=natural_key)]
+    else:
+        if args.model == 'all':
+            # validate all models in a list of names with pretrained checkpoints
+            args.pretrained = True
+            model_names = list_models(pretrained=True, exclude_filters=['*in21k'])
+            model_cfgs = [(n, '') for n in model_names]
+        elif not is_model(args.model):
+            # model name doesn't exist, try as wildcard filter
+            model_names = list_models(args.model)
+            model_cfgs = [(n, '') for n in model_names]
+
+    if len(model_cfgs):
+        results_file = args.results_file or './results-all.csv'
+        _logger.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names)))
+        results = []
+        try:
+            start_batch_size = args.batch_size
+            for m, c in model_cfgs:
+                batch_size = start_batch_size
+                args.model = m
+                args.checkpoint = c
+                result = OrderedDict(model=args.model)
+                r = {}
+                while not r and batch_size >= args.num_gpu:
+                    torch.cuda.empty_cache()
+                    try:
+                        args.batch_size = batch_size
+                        print('Validating with batch size: %d' % args.batch_size)
+                        r = validate(args)
+                    except RuntimeError as e:
+                        if batch_size <= args.num_gpu:
+                            print("Validation failed with no ability to reduce batch size. 
Exiting.") + raise e + batch_size = max(batch_size // 2, args.num_gpu) + print("Validation failed, reducing batch size by 50%") + result.update(r) + if args.checkpoint: + result['checkpoint'] = args.checkpoint + results.append(result) + except KeyboardInterrupt as e: + pass + results = sorted(results, key=lambda x: x['top1'], reverse=True) + if len(results): + write_results(results_file, results) + else: + validate(args) + + +def write_results(results_file, results): + with open(results_file, mode='w') as cf: + dw = csv.DictWriter(cf, fieldnames=results[0].keys()) + dw.writeheader() + for r in results: + dw.writerow(r) + cf.flush() + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py b/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py new file mode 100644 index 0000000000000000000000000000000000000000..e13780bed2316b7302480387aa1631e5b557a546 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LVVIT/visualize/baselines/ViT/LVViT_LRP.py @@ -0,0 +1,505 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#coding=utf-8 + +""" Vision Transformer (ViT) in PyTorch +adapted from https://github.com/hila-chefer/Transformer-Explainability/blob/main/baselines/ViT/ViT_LRP.py +""" +import torch +import torch.nn as nn +from einops import rearrange +from modules.layers_ours import * + +from baselines.ViT.helpers import load_pretrained +from baselines.ViT.weight_init import trunc_normal_ +from baselines.ViT.layer_helpers import to_2tuple + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # patch models + 'lvvit_base_patch16_224': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_m-56M-224-84.0.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ), + 'lvvit_small_patch16_224': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_s-26M-224-83.3.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ), + 'lvvit_small_patch16_384': _cfg( + url='https://github.com/zihangJiang/TokenLabeling/releases/download/1.0/lvvit_s-26M-384-84.4.pth.tar', + mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), + ) +} + +def compute_rollout_attention(all_layer_matrices, start_layer=0): + # adding residual consideration + num_tokens = all_layer_matrices[0].shape[1] + batch_size = all_layer_matrices[0].shape[0] + eye = torch.eye(num_tokens).expand(batch_size, num_tokens, num_tokens).to(all_layer_matrices[0].device) + all_layer_matrices = [all_layer_matrices[i] + eye for i in range(len(all_layer_matrices))] + # all_layer_matrices = [all_layer_matrices[i] / all_layer_matrices[i].sum(dim=-1, keepdim=True) + # for i in range(len(all_layer_matrices))] + 
joint_attention = all_layer_matrices[start_layer] + for i in range(start_layer+1, len(all_layer_matrices)): + joint_attention = all_layer_matrices[i].bmm(joint_attention) + return joint_attention + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = GELU() + self.fc2 = Linear(hidden_features, out_features) + self.drop = Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + def relprop(self, cam, **kwargs): + cam = self.drop.relprop(cam, **kwargs) + cam = self.fc2.relprop(cam, **kwargs) + cam = self.act.relprop(cam, **kwargs) + cam = self.fc1.relprop(cam, **kwargs) + return cam + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False,attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = head_dim ** -0.5 + + # A = Q*K^T + self.matmul1 = einsum('bhid,bhjd->bhij') + # attn = A*V + self.matmul2 = einsum('bhij,bhjd->bhid') + + self.qkv = Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = Dropout(proj_drop) + self.softmax = Softmax(dim=-1) + + self.attn_cam = None + self.attn = None + self.v = None + self.v_cam = None + self.attn_gradients = None + + def get_attn(self): + return self.attn + + def save_attn(self, attn): + self.attn = attn + + def save_attn_cam(self, cam): + self.attn_cam = cam + + def get_attn_cam(self): + return self.attn_cam + + def get_v(self): + return self.v + + def save_v(self, v): + self.v = v + + def save_v_cam(self, cam): + self.v_cam = cam + + def get_v_cam(self): + return self.v_cam + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def forward(self, x): + b, n, _, h = *x.shape, self.num_heads + qkv = self.qkv(x) + q, k, v = rearrange(qkv, 'b n (qkv h d) -> qkv b h n d', qkv=3, h=h) + + self.save_v(v) + + dots = self.matmul1([q, k]) * self.scale + + attn = self.softmax(dots) + attn = self.attn_drop(attn) + + self.save_attn(attn) + attn.register_hook(self.save_attn_gradients) + + out = self.matmul2([attn, v]) + out = rearrange(out, 'b h n d -> b n (h d)') + + out = self.proj(out) + out = self.proj_drop(out) + return out + + def relprop(self, cam, **kwargs): + cam = self.proj_drop.relprop(cam, **kwargs) + cam = self.proj.relprop(cam, **kwargs) + cam = rearrange(cam, 'b n (h d) -> b h n d', h=self.num_heads) + + # attn = A*V + (cam1, cam_v)= self.matmul2.relprop(cam, **kwargs) + cam1 /= 2 + cam_v /= 2 + + self.save_v_cam(cam_v) + self.save_attn_cam(cam1) + + cam1 = self.attn_drop.relprop(cam1, **kwargs) + cam1 = self.softmax.relprop(cam1, **kwargs) + + # A = Q*K^T + (cam_q, cam_k) = self.matmul1.relprop(cam1, **kwargs) + cam_q /= 2 + cam_k /= 2 + + cam_qkv = rearrange([cam_q, cam_k, cam_v], 'qkv b h n d -> b n (qkv h d)', qkv=3, h=self.num_heads) + + return self.qkv.relprop(cam_qkv, **kwargs) + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.): + super().__init__() + self.norm1 = LayerNorm(dim, eps=1e-6) + 
self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.norm2 = LayerNorm(dim, eps=1e-6) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, drop=drop) + + self.add1 = Add() + self.add2 = Add() + self.clone1 = Clone() + self.clone2 = Clone() + + def forward(self, x): + x1, x2 = self.clone1(x, 2) + x = self.add1([x1, self.attn(self.norm1(x2))/2.0]) + x1, x2 = self.clone2(x, 2) + x = self.add2([x1, self.mlp(self.norm2(x2))/2.0]) + return x + + def relprop(self, cam, **kwargs): + (cam1, cam2) = self.add2.relprop(cam, **kwargs) + cam2 = self.mlp.relprop(cam2, **kwargs) + cam2 = self.norm2.relprop(cam2, **kwargs) + cam = self.clone2.relprop((cam1, cam2), **kwargs) + + (cam1, cam2) = self.add1.relprop(cam, **kwargs) + cam2 = self.attn.relprop(cam2, **kwargs) + cam2 = self.norm1.relprop(cam2, **kwargs) + cam = self.clone1.relprop((cam1, cam2), **kwargs) + return cam + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + def relprop(self, cam, **kwargs): + cam = cam.transpose(1,2) + cam = cam.reshape(cam.shape[0], cam.shape[1], + (self.img_size[0] // self.patch_size[0]), (self.img_size[1] // self.patch_size[1])) + return self.proj.relprop(cam, **kwargs) + +class PatchEmbed4_2(nn.Module): + """ + Image to Patch Embedding with 4 layer convolution + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + + new_patch_size = to_2tuple(patch_size // 2) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.embed_dim = embed_dim + + self.conv1 = Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False) # 112x112 + self.bn1 = BatchNorm2d(64) + self.relu = ReLU(inplace=True) + self.conv2 = Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) # 112x112 + self.bn2 = BatchNorm2d(64) + self.conv3 = Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn3 = BatchNorm2d(64) + + self.proj = Conv2d(64, embed_dim, kernel_size=new_patch_size, stride=new_patch_size) + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.proj(x) # [B, C, W, H] + + return x.flatten(2).transpose(1, 2) + def relprop(self, cam, **kwargs): + cam = cam.transpose(1,2) + cam = cam.reshape(cam.shape[0], cam.shape[1], + (self.img_size[0] // self.patch_size[0]), (self.img_size[1] // self.patch_size[1])) + cam = self.proj.relprop(cam, **kwargs) + cam = 
self.bn3.relprop(cam, **kwargs) + cam = self.conv3.relprop(cam, **kwargs) + cam = self.bn2.relprop(cam, **kwargs) + cam = self.conv2.relprop(cam, **kwargs) + cam = self.bn1.relprop(cam, **kwargs) + cam = self.conv1.relprop(cam, **kwargs) + return cam + +class VisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, mlp_head=False, drop_rate=0., attn_drop_rate=0.): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.patch_embed = PatchEmbed4_2( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate) + for i in range(depth)]) + + self.norm = LayerNorm(embed_dim) + if mlp_head: + # paper diagram suggests 'MLP head', but results in 4M extra parameters vs paper + self.head = Mlp(embed_dim, int(embed_dim * mlp_ratio), num_classes) + else: + # with a single Linear layer as head, the param count within rounding of paper + self.head = Linear(embed_dim, num_classes) + self.aux_head = Linear(embed_dim, num_classes) + # FIXME not quite sure what the proper weight init is supposed to be, + # normal / trunc normal w/ std == .02 similar to other Bert like transformers + trunc_normal_(self.pos_embed, std=.02) # embeddings same as weights? 
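+        # the class token below gets the same truncated-normal init (std=0.02) as
+        # the positional embedding above, following common ViT initialisation practice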
+ trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + self.pool = IndexSelect() + self.add = Add() + + self.inp_grad = None + + def save_inp_grad(self,grad): + self.inp_grad = grad + + def get_inp_grad(self): + return self.inp_grad + + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @property + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = self.add([x, self.pos_embed]) + + x.register_hook(self.save_inp_grad) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + x = self.pool(x, dim=1, indices=torch.tensor(0, device=x.device)) + x = x.squeeze(1) + x = self.head(x) + return x + + def relprop(self, cam=None,method="transformer_attribution", is_ablation=False, start_layer=0, **kwargs): + # print(kwargs) + # print("conservation 1", cam.sum()) + cam = self.head.relprop(cam, **kwargs) + cam = cam.unsqueeze(1) + cam = self.pool.relprop(cam, **kwargs) + cam = self.norm.relprop(cam, **kwargs) + for blk in reversed(self.blocks): + cam = blk.relprop(cam, **kwargs) + + # print("conservation 2", cam.sum()) + # print("min", cam.min()) + + if method == "full": + (cam, _) = self.add.relprop(cam, **kwargs) + cam = cam[:, 1:] + cam = self.patch_embed.relprop(cam, **kwargs) + # sum on channels + cam = cam.sum(dim=1) + return cam + + elif method == "rollout": + # cam rollout + attn_cams = [] + for blk in self.blocks: + attn_heads = blk.attn.get_attn_cam().clamp(min=0) + avg_heads = (attn_heads.sum(dim=1) / attn_heads.shape[1]).detach() + attn_cams.append(avg_heads) + cam = compute_rollout_attention(attn_cams, start_layer=start_layer) + cam = cam[:, 0, 1:] + return cam + + # our method, method name grad is legacy + elif method == "transformer_attribution" or method == "grad": + cams = [] + for blk in self.blocks: + grad = blk.attn.get_attn_gradients() + cam = blk.attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cams.append(cam.unsqueeze(0)) + rollout = compute_rollout_attention(cams, start_layer=start_layer) + cam = rollout[:, 0, 1:] + return cam + + elif method == "last_layer": + cam = self.blocks[-1].attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + if is_ablation: + grad = self.blocks[-1].attn.get_attn_gradients() + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + elif method == "last_layer_attn": + cam = self.blocks[-1].attn.get_attn() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + elif method == "second_layer": + cam = self.blocks[1].attn.get_attn_cam() + cam = cam[0].reshape(-1, cam.shape[-1], cam.shape[-1]) + if is_ablation: + grad = self.blocks[1].attn.get_attn_gradients() + grad = grad[0].reshape(-1, grad.shape[-1], grad.shape[-1]) + cam = grad * cam + cam = cam.clamp(min=0).mean(dim=0) + cam = cam[0, 1:] + return cam + + +def _conv_filter(state_dict, 
patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + return out_dict + + +def lvvit_base_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=512, depth=20, num_heads=8, mlp_ratio=3, qkv_bias=False, **kwargs) + model.default_cfg = default_cfgs['lvvit_base_patch16_224'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +def lvvit_small_patch16_224(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3, qkv_bias=False, **kwargs) + model.default_cfg = default_cfgs['lvvit_small_patch16_224'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +def lvvit_small_patch16_384(pretrained=False, **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=384, depth=16, num_heads=6, mlp_ratio=3, qkv_bias=False, img_size=384, **kwargs) + model.default_cfg = default_cfgs['lvvit_small_patch16_384'] + if pretrained: + load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model
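+
+
+if __name__ == '__main__':
+    # Illustrative smoke test only -- not part of the original relevancy-propagation
+    # code. It builds a randomly initialised LV-ViT-S and runs one forward pass to
+    # confirm that the 1 cls token + 14x14 patch pipeline yields [batch, num_classes]
+    # logits. Gradients stay enabled because the attention/input hooks used for LRP
+    # cannot be registered under torch.no_grad().
+    model = lvvit_small_patch16_224(pretrained=False)
+    model.eval()
+    dummy = torch.randn(1, 3, 224, 224)
+    logits = model(dummy)
+    print(logits.shape)  # expected: torch.Size([1, 1000])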