From 8300fae4a10ca0a1631685134bf2b012f3ffd693 Mon Sep 17 00:00:00 2001 From: gp513 Date: Wed, 23 Mar 2022 14:48:40 +0800 Subject: [PATCH] Tranformer-SSL for pytorch first commit fix scripts fix test problems modify 1p performance remove redundant annotation --- .../Transformer-SSL_for_PyTorch/LICENSE | 21 + .../Transformer-SSL_for_PyTorch/README.md | 101 +++ .../Transformer-SSL_for_PyTorch/README_raw.md | 119 +++ .../Transformer-SSL_for_PyTorch/config.py | 266 +++++++ .../swin_base_patch4_window12_384.yaml | 13 + .../swin_base_patch4_window7_224.yaml | 9 + .../swin_large_patch4_window12_384.yaml | 13 + .../swin_large_patch4_window7_224.yaml | 9 + .../swin_small_patch4_window7_224.yaml | 9 + .../swin_tiny_patch4_window7_224.yaml | 9 + .../configs/moby_deit_small.yaml | 19 + .../configs/moby_swin_tiny.yaml | 24 + .../configs/moby_swin_tiny_bn.yaml | 25 + .../data/__init__.py | 1 + .../Transformer-SSL_for_PyTorch/data/build.py | 189 +++++ .../data/cached_image_folder.py | 258 ++++++ .../data/custom_image_folder.py | 32 + .../data/samplers.py | 29 + .../data/zipreader.py | 103 +++ .../Transformer-SSL_for_PyTorch/env_npu.sh | 81 ++ .../get_started.md | 199 +++++ .../Transformer-SSL_for_PyTorch/logger.py | 41 + .../lr_scheduler.py | 102 +++ .../Transformer-SSL_for_PyTorch/main.py | 347 ++++++++ .../moby_linear.py | 386 +++++++++ .../Transformer-SSL_for_PyTorch/moby_main.py | 283 +++++++ .../models/__init__.py | 1 + .../models/build.py | 75 ++ .../models/moby.py | 272 +++++++ .../models/swin_transformer.py | 750 ++++++++++++++++++ .../Transformer-SSL_for_PyTorch/optimizer.py | 58 ++ .../run8p_linear_evaluation.sh | 2 + .../run8p_pretrain.sh | 14 + .../test/env_npu.sh | 81 ++ .../test/eval_8p.sh | 122 +++ .../test/train_full_8p.sh | 151 ++++ .../test/train_performance_1p.sh | 116 +++ .../test/train_performance_8p.sh | 152 ++++ .../Transformer-SSL_for_PyTorch/utils.py | 114 +++ 39 files changed, 4596 insertions(+) create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/LICENSE create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README_raw.md create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/config.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window12_384.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window7_224.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window12_384.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window7_224.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_small_patch4_window7_224.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_tiny_patch4_window7_224.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_deit_small.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny.yaml create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny_bn.yaml create mode 100644 
PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/__init__.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/build.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/cached_image_folder.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/custom_image_folder.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/samplers.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/zipreader.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/get_started.md create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/logger.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/lr_scheduler.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/main.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_linear.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_main.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/__init__.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/build.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/moby.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/swin_transformer.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/optimizer.py create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/env_npu.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh create mode 100644 PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/utils.py diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/LICENSE b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/LICENSE new file mode 100644 index 0000000000..9e841e7a26 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md new file mode 100644 index 0000000000..82b0a668e7 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md @@ -0,0 +1,101 @@ +# MoBY with Swin Transformer, Self-Supervised Pre-training and ImageNet-1K Linear Evaluation + +This folder contains the implementation of `MoBY` with `Swin Transformer` for image classification. + +## Usage + +### Install + +- Install ASCEND-CANN, ASCEND-pytorch-1.5 and apex. + +- Install `timm==0.3.2`: + +```bash +pip install timm==0.3.2 +``` + +- Install other requirements: + +```bash +pip install opencv-python==4.4.0.46 termcolor==1.1.0 yacs==0.1.8 diffdist +``` + +### Data preparation + +We use the standard ImageNet dataset. + +- For the standard folder dataset, move validation images into labeled sub-folders. The file structure should look like: + ```bash + $ tree data + imagenet + ├── train + │ ├── class1 + │ │ ├── img1.jpeg + │ │ ├── img2.jpeg + │ │ └── ... + │ ├── class2 + │ │ ├── img3.jpeg + │ │ └── ... + │ └── ... + └── val + ├── class1 + │ ├── img4.jpeg + │ ├── img5.jpeg + │ └── ... + ├── class2 + │ ├── img6.jpeg + │ └── ... + └── ... + + ``` + +### Self-Supervised Pre-training + +To train `MoBY` with `Swin Transformer Tiny` on ImageNet, run: + +```bash +bash ./test/train_full_8p.sh --data_path= +``` + +For example, to train `MoBY` with `Swin Transformer Tiny` on 8 NPUs on a single node for 300 epochs, run: + +```bash +bash ./test/train_full_8p.sh --data_path=/data/imagenet +``` + +By default, training auto-resumes from the latest checkpoint in the output directory. Remove the `output` directory to train from scratch.
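
For reference, auto-resume amounts to scanning the output directory for the most recent checkpoint and restoring from it if one exists. A minimal sketch of this behavior (the helper name and checkpoint pattern below are illustrative, not the exact logic shipped in this patch):

```python
import glob
import os


def find_latest_checkpoint(output_dir):
    """Return the most recently modified *.pth checkpoint in output_dir, or None."""
    ckpts = glob.glob(os.path.join(output_dir, '*.pth'))
    return max(ckpts, key=os.path.getmtime) if ckpts else None
```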
+ + ### Performance Test + + To train `MoBY Swin-T` on 1 NPU for a performance test, run: + + ```bash + bash ./test/train_performance_1p.sh --data_path= + ``` + + For a performance test on 8 NPUs, run: + + ```bash + bash ./test/train_performance_8p.sh --data_path= + ``` + + ### Linear Evaluation (for accuracy test) + + To evaluate a pre-trained `MoBY` with `Swin Transformer Tiny` on ImageNet-1K linear evaluation, run: + + ```bash + bash ./test/eval_8p.sh --data_path= + ``` + + For example, to evaluate `MoBY Swin-T` with 8 NPUs on a single node for ImageNet-1K linear evaluation, run: + + ```bash + bash ./test/eval_8p.sh --data_path=/data/imagenet + ``` + + ### Training result for `MoBY Swin-T` + + | Acc@1 | FPS | NPUs | Epochs | AMP_Type | CPU | + | :------: | :------: | :------: | :------: | :------: |:------:| + | - | 140 | 1 | 1 | O1 | ARM | + | 74.14 | 1113 | 8 | 300 | O1 | ARM | \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README_raw.md b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README_raw.md new file mode 100644 index 0000000000..e220da069a --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README_raw.md @@ -0,0 +1,119 @@ +# Self-Supervised Learning with Vision Transformers + +By [Zhenda Xie](https://github.com/zdaxie/)\*, [Yutong Lin](https://github.com/impiga)\*, [Zhuliang Yao](https://github.com/Howal), [Zheng Zhang](https://stupidzz.github.io/), [Qi Dai](https://www.microsoft.com/en-us/research/people/qid/), [Yue Cao](http://yue-cao.me) and [Han Hu](https://ancientmooner.github.io/) + +This repo is the official implementation of ["Self-Supervised Learning with Swin Transformers"](https://arxiv.org/abs/2105.04553). + +**An important feature of this codebase is to include `Swin Transformer` as one of the backbones, such that we can evaluate the transferring performance of the learnt representations on down-stream tasks of object detection and semantic segmentation.** This evaluation is usually not included in previous works due to the use of ViT/DeiT, which has not been well tamed for down-stream tasks. + +It currently includes code and models for the following tasks: + +> **Self-Supervised Learning and Linear Evaluation**: Included in this repo. See [get_started.md](get_started.md) for a quick start. + +> **Transferring Performance on Object Detection/Instance Segmentation**: See [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection). + +> **Transferring Performance on Semantic Segmentation**: See [Swin Transformer for Semantic Segmentation](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation). + +## Highlights + +- **Include down-stream evaluation**: the `first work` to evaluate the transferring performance on down-stream tasks for SSL using Transformers +- **Small tricks**: significantly fewer tricks than previous works, such as MoCo v3 and DINO +- **High accuracy on ImageNet-1K linear evaluation**: 72.8 vs 72.5 (MoCo v3) vs 72.5 (DINO) using DeiT-S/16 and 300 epoch pre-training + +## Updates + +***05/13/2021*** + +1. Self-Supervised models with DeiT-Small on ImageNet-1K ([MoBY-DeiT-Small-300Ep-Pretrained](https://drive.google.com/file/d/18GtBXPPoofyPtNjDk0I3nk5nUb6Fj5HY/view?usp=sharing), [MoBY-DeiT-Small-300Ep-Linear](https://drive.google.com/file/d/1AjjGfM7Wtfxdl3rqqOqcZ8i4j4u08Psr/view?usp=sharing)) are provided. +2. The supporting code and config for self-supervised learning with DeiT-Small are provided.
+ +***05/11/2021*** + +Initial Commits: +1. Self-Supervised Pre-training models on ImageNet-1K ([MoBY-Swin-T-300Ep-Pretrained](https://drive.google.com/file/d/1PS1Q0tAnUfBWLRPxh9iUrinAxeq7Y--u/view?usp=sharing), [MoBY-Swin-T-300Ep-Linear](https://drive.google.com/file/d/1gbQynZy07uXPO-c0tOLeyG1pQzlnVHx9/view?usp=sharing)) are provided. +2. The supported code and models for self-supervised pre-training and ImageNet-1K linear evaluation, COCO object detection and ADE20K semantic segmentation are provided. + +## Introduction + +### MoBY: a self-supervised learning approach by combining MoCo v2 and BYOL + +**MoBY** (the name `MoBY` stands for **Mo**Co v2 with **BY**OL) is initially described in [arxiv](https://arxiv.org/abs/2105.04553), which is a combination of two popular self-supervised learning approaches: MoCo v2 and BYOL. It inherits the momentum design, the key queue, and the contrastive loss used in MoCo v2, and inherits the asymmetric encoders, asymmetric data augmentations and the momentum scheduler in BYOL. + +**MoBY** achieves reasonably high accuracy on ImageNet-1K linear evaluation: 72.8\% and 75.3\% top-1 accuracy using DeiT and Swin-T, respectively, by 300-epoch training. The performance is on par with recent works of MoCo v3 and DINO which adopt DeiT as the backbone, but with much lighter tricks. + +![teaser_moby](figures/teaser_moby.png) + +### Swin Transformer as a backbone + +**Swin Transformer** (the name `Swin` stands for **S**hifted **win**dow) is initially described in [arxiv](https://arxiv.org/abs/2103.14030), which capably serves as a general-purpose backbone for computer vision. It achieves strong performance on COCO object detection (`58.7 box AP` and `51.1 mask AP` on test-dev) and ADE20K semantic segmentation (`53.5 mIoU` on val), surpassing previous models by a large margin. + +We involve Swin Transformer as one of backbones to evaluate the transferring performance on down-stream tasks such as object detection. This differentiate this codebase with other approaches studying SSL on Transformer architectures. 
+ +## ImageNet-1K linear evaluation + + +| Method | Architecture | Epochs | Params | FLOPs | img/s | Top-1 Accuracy | Pre-trained Checkpoint | Linear Checkpoint | +| :--------------: | :----------: | :----: | :----: | :---: | :---: | :------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | +| Supervised | Swin-T | 300 | 28M | 4.5G | 755.2 | 81.2 | [Here](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models) | +| MoBY | Swin-T | 100 | 28M | 4.5G | 755.2 | 70.9 | [TBA]() | +| MoBY1 | Swin-T | 100 | 28M | 4.5G | 755.2 | 72.0 | [TBA]() | +| MoBY | DeiT-S | 300 | 22M | 4.6G | 940.4 | 72.8 | [GoogleDrive](https://drive.google.com/file/d/18GtBXPPoofyPtNjDk0I3nk5nUb6Fj5HY/view?usp=sharing)/[GitHub](https://github.com/SwinTransformer/storage/releases/download/v1.0.3/moby_swin_t_300ep_pretrained.pth)/[Baidu](https://pan.baidu.com/s/18u1lmHHcis85VlH0lqQwHQ) | [GoogleDrive](https://drive.google.com/file/d/1AjjGfM7Wtfxdl3rqqOqcZ8i4j4u08Psr/view?usp=sharing)/[GitHub](https://github.com/SwinTransformer/storage/releases/download/v1.0.3/moby_swin_t_300ep_linear.pth)/[Baidu](https://pan.baidu.com/s/1OHLlV4gPsfS7twos4LfxrA) | +| MoBY | Swin-T | 300 | 28M | 4.5G | 755.2 | 75.3 | [GoogleDrive](https://drive.google.com/file/d/1PS1Q0tAnUfBWLRPxh9iUrinAxeq7Y--u/view?usp=sharing)/[GitHub](https://github.com/SwinTransformer/storage/releases/download/v1.0.3/moby_deit_small_300ep_pretrained.pth)/[Baidu](https://pan.baidu.com/s/1u3mMrJ4sPQ0t5J_e0l_YnQ) | [GoogleDrive](https://drive.google.com/file/d/1gbQynZy07uXPO-c0tOLeyG1pQzlnVHx9/view?usp=sharing)/[GitHub](https://github.com/SwinTransformer/storage/releases/download/v1.0.3/moby_deit_small_300ep_linear.pth)/[Baidu](https://pan.baidu.com/s/1NZiBcHS2nuHFGHX0D52Y6w) | + +- 1 denotes the result of MoBY which has adopted a trick from MoCo v3 that replace theLayerNorm layers before the MLP blocks by BatchNorm. + +- Access code for `baidu` is `moby`. + + +## Transferring to Downstream Tasks + +**COCO Object Detection (2017 val)** + +| Backbone | Method | Model | Schd. | box mAP | mask mAP | Params | FLOPs | +| :------: | :----------------: | :---: | :---: | :-----: | :------: | :----: | :---: | +| Swin-T | Mask R-CNN | Sup. | 1x | 43.7 | 39.8 | 48M | 267G | +| Swin-T | Mask R-CNN | MoBY | 1x | 43.6 | 39.6 | 48M | 267G | +| Swin-T | Mask R-CNN | Sup. | 3x | 46.0 | 41.6 | 48M | 267G | +| Swin-T | Mask R-CNN | MoBY | 3x | 46.0 | 41.7 | 48M | 267G | +| Swin-T | Cascade Mask R-CNN | Sup. | 1x | 48.1 | 41.7 | 86M | 745G | +| Swin-T | Cascade Mask R-CNN | MoBY | 1x | 48.1 | 41.5 | 86M | 745G | +| Swin-T | Cascade Mask R-CNN | Sup. | 3x | 50.4 | 43.7 | 86M | 745G | +| Swin-T | Cascade Mask R-CNN | MoBY | 3x | 50.2 | 43.5 | 86M | 745G | + +**ADE20K Semantic Segmentation (val)** + +| Backbone | Method | Model | Crop Size | Schd. | mIoU | mIoU (ms+flip) | Params | FLOPs | +| :------: | :-----: | :---: | :-------: | :---: | :---: | :------------: | :----: | :---: | +| Swin-T | UPerNet | Sup. 
| 512x512 | 160K | 44.51 | 45.81 | 60M | 945G | +| Swin-T | UPerNet | MoBY | 512x512 | 160K | 44.06 | 45.58 | 60M | 945G | + + +## Citing MoBY and Swin + +### MoBY + +``` +@article{xie2021moby, + title={Self-Supervised Learning with Swin Transformers}, + author={Zhenda Xie and Yutong Lin and Zhuliang Yao and Zheng Zhang and Qi Dai and Yue Cao and Han Hu}, + journal={arXiv preprint arXiv:2105.04553}, + year={2021} +} +``` + +### Swin Transformer + +``` +@article{liu2021Swin, + title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` + +## Getting Started + +- For **Self-Supervised Pre-training and Linear Evaluation with MoBY and Swin Transformer**, please see [get_started.md](get_started.md) for detailed instructions. +- For **Transferring Performance on Object Detection/Instance Segmentation**, please see [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection). +- For **Transferring Performance on Semantic Segmentation**, please see [Swin Transformer for Semantic Segmentation](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation). diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/config.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/config.py new file mode 100644 index 0000000000..09105589e5 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/config.py @@ -0,0 +1,266 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# --------------------------------------------------------' + +import os +import yaml +from yacs.config import CfgNode as CN + +_C = CN() + +# Base config files +_C.BASE = [''] + +# ----------------------------------------------------------------------------- +# Data settings +# ----------------------------------------------------------------------------- +_C.DATA = CN() +# Batch size for a single GPU, could be overwritten by command line argument +_C.DATA.BATCH_SIZE = 64 +# Path to dataset, could be overwritten by command line argument +_C.DATA.DATA_PATH = '' +# Dataset name +_C.DATA.DATASET = 'imagenet' +# Input image size +_C.DATA.IMG_SIZE = 224 +# Interpolation to resize image (random, bilinear, bicubic) +_C.DATA.INTERPOLATION = 'bicubic' +# Use zipped dataset instead of folder dataset +# could be overwritten by command line argument +_C.DATA.ZIP_MODE = False +# Cache Data in Memory, could be overwritten by command line argument +_C.DATA.CACHE_MODE = 'part' +# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU. 
+_C.DATA.PIN_MEMORY = True +# Number of data loading threads +_C.DATA.NUM_WORKERS = 8 + +# ----------------------------------------------------------------------------- +# Model settings +# ----------------------------------------------------------------------------- +_C.MODEL = CN() +# Model type +_C.MODEL.TYPE = 'swin' +# Model name +_C.MODEL.NAME = 'swin_tiny_patch4_window7_224' +# Checkpoint to resume, could be overwritten by command line argument +_C.MODEL.RESUME = '' +# Number of classes, overwritten in data preparation +_C.MODEL.NUM_CLASSES = 1000 +# Dropout rate +_C.MODEL.DROP_RATE = 0.0 +# Drop path rate +_C.MODEL.DROP_PATH_RATE = 0.1 +# Label Smoothing +_C.MODEL.LABEL_SMOOTHING = 0.1 + +# Swin Transformer parameters +_C.MODEL.SWIN = CN() +_C.MODEL.SWIN.PATCH_SIZE = 4 +_C.MODEL.SWIN.IN_CHANS = 3 +_C.MODEL.SWIN.EMBED_DIM = 96 +_C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] +_C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] +_C.MODEL.SWIN.WINDOW_SIZE = 7 +_C.MODEL.SWIN.MLP_RATIO = 4. +_C.MODEL.SWIN.QKV_BIAS = True +_C.MODEL.SWIN.QK_SCALE = None +_C.MODEL.SWIN.APE = False +_C.MODEL.SWIN.PATCH_NORM = True +# Normalization layers in SwinTransformerBlock before MLP, default: 'ln', choice: ['ln', 'bn'] +_C.MODEL.SWIN.NORM_BEFORE_MLP = 'ln' + +# MoBY parameters +_C.MODEL.MOBY = CN() +_C.MODEL.MOBY.ENCODER = 'swin' +_C.MODEL.MOBY.ONLINE_DROP_PATH_RATE = 0.1 +_C.MODEL.MOBY.TARGET_DROP_PATH_RATE = 0.0 +_C.MODEL.MOBY.CONTRAST_MOMENTUM = 0.99 +_C.MODEL.MOBY.CONTRAST_TEMPERATURE = 0.2 +_C.MODEL.MOBY.CONTRAST_NUM_NEGATIVE = 4096 +_C.MODEL.MOBY.PROJ_NUM_LAYERS = 2 +_C.MODEL.MOBY.PRED_NUM_LAYERS = 2 + +# ----------------------------------------------------------------------------- +# Training settings +# ----------------------------------------------------------------------------- +_C.TRAIN = CN() +_C.TRAIN.START_EPOCH = 0 +_C.TRAIN.EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 20 +_C.TRAIN.WEIGHT_DECAY = 0.05 +_C.TRAIN.BASE_LR = 5e-4 +_C.TRAIN.WARMUP_LR = 5e-7 +_C.TRAIN.MIN_LR = 5e-6 +# Clip gradient norm +_C.TRAIN.CLIP_GRAD = 5.0 +# Auto resume from latest checkpoint +_C.TRAIN.AUTO_RESUME = True +# Gradient accumulation steps +# could be overwritten by command line argument +_C.TRAIN.ACCUMULATION_STEPS = 0 +# Whether to use gradient checkpointing to save memory +# could be overwritten by command line argument +_C.TRAIN.USE_CHECKPOINT = False + +# LR scheduler +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'cosine' +# Epoch interval to decay LR, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 +# LR decay rate, used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 + +# Optimizer +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'adamw' +# Optimizer Epsilon +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +# Optimizer Betas +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) +# SGD momentum +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# ----------------------------------------------------------------------------- +# Linear eval settings +# ----------------------------------------------------------------------------- +_C.LINEAR_EVAL = CN() +_C.LINEAR_EVAL.PRETRAINED = '' + +# ----------------------------------------------------------------------------- +# Augmentation settings +# ----------------------------------------------------------------------------- +_C.AUG = CN() +# Color jitter factor +_C.AUG.COLOR_JITTER = 0.4 +# Use AutoAugment policy. 
"v0" or "original" +_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' +# Random erase prob +_C.AUG.REPROB = 0.25 +# Random erase mode +_C.AUG.REMODE = 'pixel' +# Random erase count +_C.AUG.RECOUNT = 1 +# Mixup alpha, mixup enabled if > 0 +_C.AUG.MIXUP = 0.8 +# Cutmix alpha, cutmix enabled if > 0 +_C.AUG.CUTMIX = 1.0 +# Cutmix min/max ratio, overrides alpha and enables cutmix if set +_C.AUG.CUTMIX_MINMAX = None +# Probability of performing mixup or cutmix when either/both is enabled +_C.AUG.MIXUP_PROB = 1.0 +# Probability of switching to cutmix when both mixup and cutmix enabled +_C.AUG.MIXUP_SWITCH_PROB = 0.5 +# How to apply mixup/cutmix params. Per "batch", "pair", or "elem" +_C.AUG.MIXUP_MODE = 'batch' +# Self-Supervised Learning Augmentation +_C.AUG.SSL_AUG = False +# SSL-Aug type +_C.AUG.SSL_AUG_TYPE = 'byol' +# SSL-Aug crop +_C.AUG.SSL_AUG_CROP = 0.08 +# Self-Supervised Learning Linear Evaluation Augmentation +_C.AUG.SSL_LINEAR_AUG = False + +# ----------------------------------------------------------------------------- +# Testing settings +# ----------------------------------------------------------------------------- +_C.TEST = CN() +# Whether to use center crop when testing +_C.TEST.CROP = True + +# ----------------------------------------------------------------------------- +# Misc +# ----------------------------------------------------------------------------- +# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2') +# overwritten by command line argument +_C.AMP_OPT_LEVEL = '' +# Path to output folder, overwritten by command line argument +_C.OUTPUT = '' +# Tag of experiment, overwritten by command line argument +_C.TAG = 'default' +# Frequency to save checkpoint +_C.SAVE_FREQ = 1 +# Frequency to logging info +_C.PRINT_FREQ = 10 +# Fixed random seed +_C.SEED = 0 +# Perform evaluation only, overwritten by command line argument +_C.EVAL_MODE = False +# Test throughput only, overwritten by command line argument +_C.THROUGHPUT_MODE = False +# local rank for DistributedDataParallel, given by command line argument +_C.LOCAL_RANK = 0 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as f: + yaml_cfg = yaml.load(f, Loader=yaml.FullLoader) + + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('=> merge config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + + +def update_config(config, args): + _update_config_from_file(config, args.cfg) + + config.defrost() + if args.opts: + config.merge_from_list(args.opts) + + # merge from specific arguments + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.zip: + config.DATA.ZIP_MODE = True + if args.cache_mode: + config.DATA.CACHE_MODE = args.cache_mode + if args.resume: + config.MODEL.RESUME = args.resume + if args.accumulation_steps: + config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps + if args.use_checkpoint: + config.TRAIN.USE_CHECKPOINT = True + if args.amp_opt_level: + config.AMP_OPT_LEVEL = args.amp_opt_level + if args.output: + config.OUTPUT = args.output + if args.tag: + config.TAG = args.tag + if args.eval: + config.EVAL_MODE = True + if args.throughput: + config.THROUGHPUT_MODE = True + if args.epochs: + config.TRAIN.EPOCHS = args.epochs + + # set local rank for distributed training + config.LOCAL_RANK = args.local_rank + + # output folder + config.OUTPUT 
= os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG) + + config.freeze() + + +def get_config(args): + """Get a yacs CfgNode object with default values.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + config = _C.clone() + update_config(config, args) + + return config diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window12_384.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window12_384.yaml new file mode 100644 index 0000000000..b54deb781c --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window12_384.yaml @@ -0,0 +1,13 @@ +# only for evaluation +DATA: + IMG_SIZE: 384 +MODEL: + TYPE: swin + NAME: swin_base_patch4_window12_384 + SWIN: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 4, 8, 16, 32 ] + WINDOW_SIZE: 12 +TEST: + CROP: False \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window7_224.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window7_224.yaml new file mode 100644 index 0000000000..b29612858b --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_base_patch4_window7_224.yaml @@ -0,0 +1,9 @@ +MODEL: + TYPE: swin + NAME: swin_base_patch4_window7_224 + DROP_PATH_RATE: 0.5 + SWIN: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 4, 8, 16, 32 ] + WINDOW_SIZE: 7 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window12_384.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window12_384.yaml new file mode 100644 index 0000000000..bacf5f6a17 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window12_384.yaml @@ -0,0 +1,13 @@ +# only for evaluation +DATA: + IMG_SIZE: 384 +MODEL: + TYPE: swin + NAME: swin_large_patch4_window12_384 + SWIN: + EMBED_DIM: 192 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 6, 12, 24, 48 ] + WINDOW_SIZE: 12 +TEST: + CROP: False \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window7_224.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window7_224.yaml new file mode 100644 index 0000000000..df8af4cecb --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_large_patch4_window7_224.yaml @@ -0,0 +1,9 @@ +# only for evaluation +MODEL: + TYPE: swin + NAME: swin_large_patch4_window7_224 + SWIN: + EMBED_DIM: 192 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 6, 12, 24, 48 ] + WINDOW_SIZE: 7 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_small_patch4_window7_224.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_small_patch4_window7_224.yaml new file mode 100644 index 0000000000..8f5c40fda6 --- /dev/null +++ 
b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_small_patch4_window7_224.yaml @@ -0,0 +1,9 @@ +MODEL: + TYPE: swin + NAME: swin_small_patch4_window7_224 + DROP_PATH_RATE: 0.3 + SWIN: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 3, 6, 12, 24 ] + WINDOW_SIZE: 7 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_tiny_patch4_window7_224.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_tiny_patch4_window7_224.yaml new file mode 100644 index 0000000000..851c7451f6 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/configs_archived/swin_tiny_patch4_window7_224.yaml @@ -0,0 +1,9 @@ +MODEL: + TYPE: swin + NAME: swin_tiny_patch4_window7_224 + DROP_PATH_RATE: 0.2 + SWIN: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 6, 2 ] + NUM_HEADS: [ 3, 6, 12, 24 ] + WINDOW_SIZE: 7 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_deit_small.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_deit_small.yaml new file mode 100644 index 0000000000..8499acb0da --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_deit_small.yaml @@ -0,0 +1,19 @@ +TRAIN: + WARMUP_EPOCHS: 5 + EPOCHS: 300 + BASE_LR: 0.001 + WEIGHT_DECAY: 0.05 +AUG: + SSL_AUG: True +MODEL: + TYPE: moby + NAME: moby__deit_small__odpr02_tdpr0_cm099_ct02_queue4096_proj2_pred2 + MOBY: + ENCODER: deit_small + ONLINE_DROP_PATH_RATE: 0.2 + TARGET_DROP_PATH_RATE: 0.0 + CONTRAST_MOMENTUM: 0.99 + CONTRAST_TEMPERATURE: 0.2 + CONTRAST_NUM_NEGATIVE: 4096 + PROJ_NUM_LAYERS: 2 + PRED_NUM_LAYERS: 2 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny.yaml new file mode 100644 index 0000000000..26a2bc1480 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny.yaml @@ -0,0 +1,24 @@ +TRAIN: + WARMUP_EPOCHS: 5 + EPOCHS: 300 + BASE_LR: 0.001 + WEIGHT_DECAY: 0.05 +AUG: + SSL_AUG: True +MODEL: + TYPE: moby + NAME: moby__swin_tiny__patch4_window7_224__odpr02_tdpr0_cm099_ct02_queue4096_proj2_pred2 + SWIN: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 6, 2 ] + NUM_HEADS: [ 3, 6, 12, 24 ] + WINDOW_SIZE: 7 + MOBY: + ENCODER: swin + ONLINE_DROP_PATH_RATE: 0.2 + TARGET_DROP_PATH_RATE: 0.0 + CONTRAST_MOMENTUM: 0.99 + CONTRAST_TEMPERATURE: 0.2 + CONTRAST_NUM_NEGATIVE: 4096 + PROJ_NUM_LAYERS: 2 + PRED_NUM_LAYERS: 2 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny_bn.yaml b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny_bn.yaml new file mode 100644 index 0000000000..df45c2db95 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/configs/moby_swin_tiny_bn.yaml @@ -0,0 +1,25 @@ +TRAIN: + WARMUP_EPOCHS: 5 + EPOCHS: 300 + BASE_LR: 0.001 + WEIGHT_DECAY: 0.05 +AUG: + SSL_AUG: True +MODEL: + TYPE: moby + NAME: moby__swin_tiny__patch4_window7_224_bn__odpr02_tdpr0_cm099_ct02_queue4096_proj2_pred2 + SWIN: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 6, 2 ] + NUM_HEADS: [ 3, 6, 12, 24 ] + WINDOW_SIZE: 7 + NORM_BEFORE_MLP: bn + MOBY: + ENCODER: swin + ONLINE_DROP_PATH_RATE: 0.2 + TARGET_DROP_PATH_RATE: 0.0 + 
CONTRAST_MOMENTUM: 0.99 + CONTRAST_TEMPERATURE: 0.2 + CONTRAST_NUM_NEGATIVE: 4096 + PROJ_NUM_LAYERS: 2 + PRED_NUM_LAYERS: 2 \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/__init__.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/__init__.py new file mode 100644 index 0000000000..70c633ce61 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/__init__.py @@ -0,0 +1 @@ +from .build import build_loader \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/build.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/build.py new file mode 100644 index 0000000000..7fe7800a0d --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/build.py @@ -0,0 +1,189 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import torch +import numpy as np +from PIL import ImageFilter, ImageOps +import torch.distributed as dist +from torchvision import datasets, transforms +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data import Mixup +from timm.data import create_transform +from timm.data.transforms import _pil_interp + +from .cached_image_folder import CachedImageFolder +from .custom_image_folder import CustomImageFolder +from .samplers import SubsetRandomSampler + + +def build_loader(config): + config.defrost() + dataset_train, config.MODEL.NUM_CLASSES = build_dataset(is_train=True, config=config) + config.freeze() + print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build train dataset") + dataset_val, _ = build_dataset(is_train=False, config=config) + print(f"local rank {config.LOCAL_RANK} / global rank {dist.get_rank()} successfully build val dataset") + + num_tasks = dist.get_world_size() + global_rank = dist.get_rank() + if config.DATA.ZIP_MODE and config.DATA.CACHE_MODE == 'part': + indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size()) + sampler_train = SubsetRandomSampler(indices) + else: + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + + indices = np.arange(dist.get_rank(), len(dataset_val), dist.get_world_size()) + sampler_val = SubsetRandomSampler(indices) + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=config.DATA.BATCH_SIZE, + num_workers=config.DATA.NUM_WORKERS, + pin_memory=config.DATA.PIN_MEMORY, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=config.DATA.BATCH_SIZE, + shuffle=False, + num_workers=config.DATA.NUM_WORKERS, + pin_memory=config.DATA.PIN_MEMORY, + drop_last=False + ) + + # setup mixup / cutmix + mixup_fn = None + mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. 
or config.AUG.CUTMIX_MINMAX is not None + if mixup_active: + mixup_fn = Mixup( + mixup_alpha=config.AUG.MIXUP, cutmix_alpha=config.AUG.CUTMIX, cutmix_minmax=config.AUG.CUTMIX_MINMAX, + prob=config.AUG.MIXUP_PROB, switch_prob=config.AUG.MIXUP_SWITCH_PROB, mode=config.AUG.MIXUP_MODE, + label_smoothing=config.MODEL.LABEL_SMOOTHING, num_classes=config.MODEL.NUM_CLASSES) + + return dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn + + +def build_dataset(is_train, config): + transform = build_transform(is_train, config) + if config.DATA.DATASET == 'imagenet': + prefix = 'train' if is_train else 'val' + if config.DATA.ZIP_MODE: + ann_file = prefix + "_map.txt" + prefix = prefix + ".zip@/" + dataset = CachedImageFolder(config.DATA.DATA_PATH, ann_file, prefix, transform, + cache_mode=config.DATA.CACHE_MODE if is_train else 'part') + else: + # ToDo: test custom_image_folder + root = os.path.join(config.DATA.DATA_PATH, prefix) + dataset = CustomImageFolder(root, transform=transform) + nb_classes = 1000 + else: + raise NotImplementedError("We only support ImageNet Now.") + + return dataset, nb_classes + + +def build_transform(is_train, config): + if config.AUG.SSL_AUG: + if config.AUG.SSL_AUG_TYPE == 'byol': + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + transform_1 = transforms.Compose([ + transforms.RandomResizedCrop(config.DATA.IMG_SIZE, scale=(config.AUG.SSL_AUG_CROP, 1.)), + transforms.RandomHorizontalFlip(), + transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.2, 0.1)], p=0.8), + transforms.RandomGrayscale(p=0.2), + transforms.RandomApply([GaussianBlur()], p=1.0), + transforms.ToTensor(), + normalize, + ]) + transform_2 = transforms.Compose([ + transforms.RandomResizedCrop(config.DATA.IMG_SIZE, scale=(config.AUG.SSL_AUG_CROP, 1.)), + transforms.RandomHorizontalFlip(), + transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.2, 0.1)], p=0.8), + transforms.RandomGrayscale(p=0.2), + transforms.RandomApply([GaussianBlur()], p=0.1), + transforms.RandomApply([ImageOps.solarize], p=0.2), + transforms.ToTensor(), + normalize, + ]) + + transform = (transform_1, transform_2) + return transform + else: + raise NotImplementedError + + if config.AUG.SSL_LINEAR_AUG: + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + if is_train: + transform = transforms.Compose([ + transforms.RandomResizedCrop(config.DATA.IMG_SIZE), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ]) + else: + transform = transforms.Compose([ + transforms.Resize(config.DATA.IMG_SIZE + 32), + transforms.CenterCrop(config.DATA.IMG_SIZE), + transforms.ToTensor(), + normalize, + ]) + return [transform] + + resize_im = config.DATA.IMG_SIZE > 32 + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=config.DATA.IMG_SIZE, + is_training=True, + color_jitter=config.AUG.COLOR_JITTER if config.AUG.COLOR_JITTER > 0 else None, + auto_augment=config.AUG.AUTO_AUGMENT if config.AUG.AUTO_AUGMENT != 'none' else None, + re_prob=config.AUG.REPROB, + re_mode=config.AUG.REMODE, + re_count=config.AUG.RECOUNT, + interpolation=config.DATA.INTERPOLATION, + ) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4) + return transform + + t = [] + if resize_im: + if config.TEST.CROP: + size = int((256 / 224) * config.DATA.IMG_SIZE) + t.append( + 
transforms.Resize(size, interpolation=_pil_interp(config.DATA.INTERPOLATION)), + # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(config.DATA.IMG_SIZE)) + else: + t.append( + transforms.Resize((config.DATA.IMG_SIZE, config.DATA.IMG_SIZE), + interpolation=_pil_interp(config.DATA.INTERPOLATION)) + ) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) + return transforms.Compose(t) + + +class GaussianBlur(object): + """Gaussian Blur version 2""" + + def __call__(self, x): + sigma = np.random.uniform(0.1, 2.0) + x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) + return x \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/cached_image_folder.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/cached_image_folder.py new file mode 100644 index 0000000000..994c68c228 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/cached_image_folder.py @@ -0,0 +1,258 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import io +import os +import time +import torch.distributed as dist +import torch.utils.data as data +from PIL import Image + +from .zipreader import is_zip_path, ZipReader + + +def has_file_allowed_extension(filename, extensions): + """Checks if a file is an allowed extension. + Args: + filename (string): path to a file + Returns: + bool: True if the filename ends with a known image extension + """ + filename_lower = filename.lower() + return any(filename_lower.endswith(ext) for ext in extensions) + + +def find_classes(dir): + classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))] + classes.sort() + class_to_idx = {classes[i]: i for i in range(len(classes))} + return classes, class_to_idx + + +def make_dataset(dir, class_to_idx, extensions): + images = [] + dir = os.path.expanduser(dir) + for target in sorted(os.listdir(dir)): + d = os.path.join(dir, target) + if not os.path.isdir(d): + continue + + for root, _, fnames in sorted(os.walk(d)): + for fname in sorted(fnames): + if has_file_allowed_extension(fname, extensions): + path = os.path.join(root, fname) + item = (path, class_to_idx[target]) + images.append(item) + + return images + + +def make_dataset_with_ann(ann_file, img_prefix, extensions): + images = [] + with open(ann_file, "r") as f: + contents = f.readlines() + for line_str in contents: + path_contents = [c for c in line_str.split('\t')] + im_file_name = path_contents[0] + class_index = int(path_contents[1]) + + assert str.lower(os.path.splitext(im_file_name)[-1]) in extensions + item = (os.path.join(img_prefix, im_file_name), class_index) + + images.append(item) + + return images + + +class DatasetFolder(data.Dataset): + """A generic data loader where the samples are arranged in this way: :: + root/class_x/xxx.ext + root/class_x/xxy.ext + root/class_x/xxz.ext + root/class_y/123.ext + root/class_y/nsdf3.ext + root/class_y/asd932_.ext + Args: + root (string): Root directory path. + loader (callable): A function to load a sample given its path. + extensions (list[string]): A list of allowed extensions. + transform (callable, optional): A function/transform that takes in + a sample and returns a transformed version. 
+ E.g, ``transforms.RandomCrop`` for images. + target_transform (callable, optional): A function/transform that takes + in the target and transforms it. + Attributes: + samples (list): List of (sample path, class_index) tuples + """ + + def __init__(self, root, loader, extensions, ann_file='', img_prefix='', transform=None, target_transform=None, + cache_mode="no"): + # image folder mode + if ann_file == '': + _, class_to_idx = find_classes(root) + samples = make_dataset(root, class_to_idx, extensions) + # zip mode + else: + samples = make_dataset_with_ann(os.path.join(root, ann_file), + os.path.join(root, img_prefix), + extensions) + + if len(samples) == 0: + raise (RuntimeError("Found 0 files in subfolders of: " + root + "\n" + + "Supported extensions are: " + ",".join(extensions))) + + self.root = root + self.loader = loader + self.extensions = extensions + + self.samples = samples + self.labels = [y_1k for _, y_1k in samples] + self.classes = list(set(self.labels)) + + self.transform = transform + self.target_transform = target_transform + + self.cache_mode = cache_mode + if self.cache_mode != "no": + self.init_cache() + + def init_cache(self): + assert self.cache_mode in ["part", "full"] + n_sample = len(self.samples) + global_rank = dist.get_rank() + world_size = dist.get_world_size() + + samples_bytes = [None for _ in range(n_sample)] + start_time = time.time() + for index in range(n_sample): + if index % (n_sample // 10) == 0: + t = time.time() - start_time + print(f'global_rank {dist.get_rank()} cached {index}/{n_sample} takes {t:.2f}s per block') + start_time = time.time() + path, target = self.samples[index] + if self.cache_mode == "full": + samples_bytes[index] = (ZipReader.read(path), target) + elif self.cache_mode == "part" and index % world_size == global_rank: + samples_bytes[index] = (ZipReader.read(path), target) + else: + samples_bytes[index] = (path, target) + self.samples = samples_bytes + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + if self.target_transform is not None: + target = self.target_transform(target) + + return sample, target + + def __len__(self): + return len(self.samples) + + def __repr__(self): + fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' + fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) + fmt_str += ' Root Location: {}\n'.format(self.root) + tmp = ' Transforms (if any): ' + fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + tmp = ' Target Transforms (if any): ' + fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) + return fmt_str + + +IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'] + + +def pil_loader(path): + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + if isinstance(path, bytes): + img = Image.open(io.BytesIO(path)) + elif is_zip_path(path): + data = ZipReader.read(path) + img = Image.open(io.BytesIO(data)) + else: + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def default_img_loader(path): + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class CachedImageFolder(DatasetFolder): + """A generic data loader where the images are arranged in this way: :: + root/dog/xxx.png + root/dog/xxy.png + root/dog/xxz.png + root/cat/123.png + root/cat/nsdf3.png + root/cat/asd932_.png + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. E.g, ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + Attributes: + imgs (list): List of (image path, class_index) tuples + """ + + def __init__(self, root, ann_file='', img_prefix='', transform=None, target_transform=None, + loader=default_img_loader, cache_mode="no"): + super(CachedImageFolder, self).__init__(root, loader, IMG_EXTENSIONS, + ann_file=ann_file, img_prefix=img_prefix, + transform=transform, target_transform=target_transform, + cache_mode=cache_mode) + self.imgs = self.samples + if not isinstance(self.transform, (tuple, list)) and self.transform is not None: + self.transform = [self.transform] + + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + image = self.loader(path) + + ret = [] + if self.transform is not None: + for t in self.transform: + ret.append(t(image)) + else: + ret.append(image) + if self.target_transform is not None: + target = self.target_transform(target) + ret.append(target) + + return ret diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/custom_image_folder.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/custom_image_folder.py new file mode 100644 index 0000000000..3235e012b5 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/custom_image_folder.py @@ -0,0 +1,32 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Zhenda Xie +# -------------------------------------------------------- + +from torchvision import datasets + + +class CustomImageFolder(datasets.ImageFolder): + def __getitem__(self, index): + """ + Args: + index (int): Index + Returns: + tuple: (image, target) where target is class_index of the target class. + """ + path, target = self.samples[index] + image = self.loader(path) + + ret = [] + if self.transform is not None: + for t in self.transform: + ret.append(t(image)) + else: + ret.append(image) + if self.target_transform is not None: + target = self.target_transform(target) + ret.append(target) + + return ret \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/samplers.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/samplers.py new file mode 100644 index 0000000000..596e220990 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/samplers.py @@ -0,0 +1,29 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import torch + + +class SubsetRandomSampler(torch.utils.data.Sampler): + r"""Samples elements randomly from a given list of indices, without replacement. 
+ + Arguments: + indices (sequence): a sequence of indices + """ + + def __init__(self, indices): + self.epoch = 0 + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in torch.randperm(len(self.indices))) + + def __len__(self): + return len(self.indices) + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/zipreader.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/zipreader.py new file mode 100644 index 0000000000..060bc46a76 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/data/zipreader.py @@ -0,0 +1,103 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import os +import zipfile +import io +import numpy as np +from PIL import Image +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +def is_zip_path(img_or_path): + """judge if this is a zip path""" + return '.zip@' in img_or_path + + +class ZipReader(object): + """A class to read zipped files""" + zip_bank = dict() + + def __init__(self): + super(ZipReader, self).__init__() + + @staticmethod + def get_zipfile(path): + zip_bank = ZipReader.zip_bank + if path not in zip_bank: + zfile = zipfile.ZipFile(path, 'r') + zip_bank[path] = zfile + return zip_bank[path] + + @staticmethod + def split_zip_style_path(path): + pos_at = path.index('@') + assert pos_at != -1, "character '@' is not found from the given path '%s'" % path + + zip_path = path[0: pos_at] + folder_path = path[pos_at + 1:] + folder_path = str.strip(folder_path, '/') + return zip_path, folder_path + + @staticmethod + def list_folder(path): + zip_path, folder_path = ZipReader.split_zip_style_path(path) + + zfile = ZipReader.get_zipfile(zip_path) + folder_list = [] + for file_foler_name in zfile.namelist(): + file_foler_name = str.strip(file_foler_name, '/') + if file_foler_name.startswith(folder_path) and \ + len(os.path.splitext(file_foler_name)[-1]) == 0 and \ + file_foler_name != folder_path: + if len(folder_path) == 0: + folder_list.append(file_foler_name) + else: + folder_list.append(file_foler_name[len(folder_path) + 1:]) + + return folder_list + + @staticmethod + def list_files(path, extension=None): + if extension is None: + extension = ['.*'] + zip_path, folder_path = ZipReader.split_zip_style_path(path) + + zfile = ZipReader.get_zipfile(zip_path) + file_lists = [] + for file_foler_name in zfile.namelist(): + file_foler_name = str.strip(file_foler_name, '/') + if file_foler_name.startswith(folder_path) and \ + str.lower(os.path.splitext(file_foler_name)[-1]) in extension: + if len(folder_path) == 0: + file_lists.append(file_foler_name) + else: + file_lists.append(file_foler_name[len(folder_path) + 1:]) + + return file_lists + + @staticmethod + def read(path): + zip_path, path_img = ZipReader.split_zip_style_path(path) + zfile = ZipReader.get_zipfile(zip_path) + data = zfile.read(path_img) + return data + + @staticmethod + def imread(path): + zip_path, path_img = ZipReader.split_zip_style_path(path) + zfile = ZipReader.get_zipfile(zip_path) + data = zfile.read(path_img) + try: + im = Image.open(io.BytesIO(data)) + except: + print("ERROR IMG LOADED: ", path_img) + random_img = np.random.rand(224, 224, 3) * 255 + im = Image.fromarray(np.uint8(random_img)) + return im diff --git 
a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh new file mode 100644 index 0000000000..c5f14bc83e --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh @@ -0,0 +1,81 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 
+# Enable Event logging, 0-off/1-on +export ASCEND_GLOBAL_EVENT_ENABLE=0 +# Enable the task queue, 0-off/1-on +export TASK_QUEUE_ENABLE=1 +# Enable PTCopy, 0-off/1-on +export PTCOPY_ENABLE=1 +# Enable the combined flag, 0-off/1-on +export COMBINED_ENABLE=1 +# Whether special scenarios require recompilation; no modification needed +export DYNAMIC_OP="ADD#MUL" +# HCCL whitelist switch, 1-off/0-on +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |awk '{print $1}') + +export SCALAR_TO_HOST_MEM=1 +export BMMV2_ENABLE=1 + +# Set device-side log level to error +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 +# Disable device-side Event logging +${install_path}/driver/tools/msnpureport -e disable + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/get_started.md b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/get_started.md new file mode 100644 index 0000000000..d9d2877453 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/get_started.md @@ -0,0 +1,199 @@ +# MoBY with Swin Transformer, Self-Supervised Pre-training and ImageNet-1K Linear Evaluation + +This folder contains the implementation of `MoBY` with `Swin Transformer` for image classification. + +## Model Zoo + +### ImageNet-1K Linear Evaluation Results + +| Method | Architecture | Epochs | Params | FLOPs | img/s | Top-1 Accuracy | Pre-trained Checkpoint | Linear Checkpoint | +| :--------------: | :----------: | :----: | :----: | :---: | :---: | :------------: | :----------------------------------------: | :----------------------------------------: | +| Supervised | Swin-T | 300 | 28M | 4.5G | 755.2 | 81.2 | [Here](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models) | +| MoBY | Swin-T | 100 | 28M | 4.5G | 755.2 | 70.9 | [TBA]() | +| MoBY1 | Swin-T | 100 | 28M | 4.5G | 755.2 | 72.0 | [TBA]() | +| MoBY | DeiT-S | 300 | 22M | 4.6G | 940.4 | 72.8 | [GoogleDrive](https://drive.google.com/file/d/18GtBXPPoofyPtNjDk0I3nk5nUb6Fj5HY/view?usp=sharing)/[GitHub]()/[Baidu]() | [GoogleDrive](https://drive.google.com/file/d/1AjjGfM7Wtfxdl3rqqOqcZ8i4j4u08Psr/view?usp=sharing)/[GitHub]()/[Baidu]() | +| MoBY | Swin-T | 300 | 28M | 4.5G | 755.2 | 75.3 | [GoogleDrive](https://drive.google.com/file/d/1PS1Q0tAnUfBWLRPxh9iUrinAxeq7Y--u/view?usp=sharing)/[GitHub]()/[Baidu]() | [GoogleDrive](https://drive.google.com/file/d/1gbQynZy07uXPO-c0tOLeyG1pQzlnVHx9/view?usp=sharing)/[GitHub]()/[Baidu]() | + +- 1 denotes the result of MoBY which adopts a trick from MoCo v3 that replaces the LayerNorm layers before the MLP blocks with BatchNorm.
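+
+The sketch below is an illustrative, minimal example rather than code from this repository: the class name `BlockWithPreMLPNorm` and its arguments are invented for the illustration. It only shows what the trick above amounts to in practice, i.e. selecting the normalization applied before a block's MLP, which this repository exposes through the `MODEL.SWIN.NORM_BEFORE_MLP` config key.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class BlockWithPreMLPNorm(nn.Module):
+    """Minimal residual MLP sub-block with a configurable pre-MLP normalization."""
+
+    def __init__(self, dim, mlp_ratio=4.0, norm_before_mlp='ln'):
+        super().__init__()
+        if norm_before_mlp == 'ln':
+            self.norm2 = nn.LayerNorm(dim)
+        elif norm_before_mlp == 'bn':
+            # BatchNorm1d normalizes (N, C) inputs over the channel dim,
+            # so tokens are flattened to (B * L, C) before it is applied.
+            self.norm2 = nn.BatchNorm1d(dim)
+        else:
+            raise ValueError(f'unknown norm_before_mlp: {norm_before_mlp}')
+        hidden_dim = int(dim * mlp_ratio)
+        self.mlp = nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU(), nn.Linear(hidden_dim, dim))
+
+    def forward(self, x):
+        # x: (B, L, C) token sequence
+        b, l, c = x.shape
+        if isinstance(self.norm2, nn.BatchNorm1d):
+            y = self.norm2(x.reshape(b * l, c)).reshape(b, l, c)
+        else:
+            y = self.norm2(x)
+        return x + self.mlp(y)
+
+
+if __name__ == '__main__':
+    block = BlockWithPreMLPNorm(dim=96, norm_before_mlp='bn')
+    print(block(torch.randn(2, 49, 96)).shape)  # torch.Size([2, 49, 96])
+```
+
+When `'bn'` is selected in this repository, `models/moby.py` additionally converts the BatchNorm layers of the encoders and the projection/prediction heads to `SyncBatchNorm`, so that batch statistics are synchronized across devices during distributed training.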
+ +## Usage + +### Install + +- Clone this repo: + +```bash +git clone https://github.com/Swin-Transformer/Transformer-SSL +cd Transformer-SSL +``` + +- Create a conda virtual environment and activate it: + +```bash +conda create -n transformer-ssl python=3.7 -y +conda activate transformer-ssl +``` + +- Install `CUDA==10.1` with `cudnn7` following + the [official installation instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) +- Install `PyTorch==1.7.1` and `torchvision==0.8.2` with `CUDA==10.1`: + +```bash +conda install pytorch==1.7.1 torchvision==0.8.2 cudatoolkit=10.1 -c pytorch +``` + +- Install `timm==0.3.2`: + +```bash +pip install timm==0.3.2 +``` + +- Install `Apex`: + +```bash +git clone https://github.com/NVIDIA/apex +cd apex +pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +``` + +- Install other requirements: + +```bash +pip install opencv-python==4.4.0.46 termcolor==1.1.0 yacs==0.1.8 diffdist +``` + +### Data preparation + +We use the standard ImageNet dataset, which you can download from http://image-net.org/. We provide the following two ways to load data: + +- For the standard folder dataset, move validation images to labeled sub-folders. The file structure should look like: + ```bash + $ tree data + imagenet + ├── train + │ ├── class1 + │ │ ├── img1.jpeg + │ │ ├── img2.jpeg + │ │ └── ... + │ ├── class2 + │ │ ├── img3.jpeg + │ │ └── ... + │ └── ... + └── val + ├── class1 + │ ├── img4.jpeg + │ ├── img5.jpeg + │ └── ... + ├── class2 + │ ├── img6.jpeg + │ └── ... + └── ... + + ``` +- To speed up reading images from massive numbers of small files, we also support zipped ImageNet, which includes + four files: + - `train.zip`, `val.zip`: which store the zipped folders for the train and validation splits. + - `train_map.txt`, `val_map.txt`: which store the relative path in the corresponding zip file and the ground-truth + label. Make sure the data folder looks like this: + + ```bash + $ tree data + data + └── ImageNet-Zip + ├── train_map.txt + ├── train.zip + ├── val_map.txt + └── val.zip + + $ head -n 5 data/ImageNet-Zip/val_map.txt + ILSVRC2012_val_00000001.JPEG 65 + ILSVRC2012_val_00000002.JPEG 970 + ILSVRC2012_val_00000003.JPEG 230 + ILSVRC2012_val_00000004.JPEG 809 + ILSVRC2012_val_00000005.JPEG 516 + + $ head -n 5 data/ImageNet-Zip/train_map.txt + n01440764/n01440764_10026.JPEG 0 + n01440764/n01440764_10027.JPEG 0 + n01440764/n01440764_10029.JPEG 0 + n01440764/n01440764_10040.JPEG 0 + n01440764/n01440764_10042.JPEG 0 + ``` + +### Self-Supervised Pre-training + +To train `MoBY` with `Swin Transformer` on ImageNet, run: + +```bash +python -m torch.distributed.launch --nproc_per_node <num-gpus> --master_port 12345 moby_main.py \ +--cfg <config-file> --data-path <imagenet-path> [--batch-size <batch-size-per-gpu> --output <output-directory> --tag <job-tag>] +``` + +- We recommend using `--output` and `--tag` to organize your experiments. + +**Notes**: + +- To use zipped ImageNet instead of the folder dataset, add `--zip` to the parameters. + - To cache the dataset in the memory instead of reading from files every time, add `--cache-mode part`, which will + shard the dataset into non-overlapping pieces for different GPUs and only load the corresponding one for each GPU. +- When GPU memory is insufficient, you can try the following suggestions: + - Use gradient accumulation by adding `--accumulation-steps <steps>`; set an appropriate `<steps>` value according to your needs. + - Use gradient checkpointing by adding `--use-checkpoint`, e.g., it saves about 60% memory when training `Swin-B`.
+ Please refer to [this page](https://pytorch.org/docs/stable/checkpoint.html) for more details. + - We recommend using multi-node training with more GPUs for very large models; a tutorial can be found + on [this page](https://pytorch.org/tutorials/intermediate/dist_tuto.html). +- To change config options in general, you can use `--opts KEY1 VALUE1 KEY2 VALUE2`, e.g., + `--opts TRAIN.EPOCHS 100 TRAIN.WARMUP_EPOCHS 5` will change total epochs to 100 and warm-up epochs to 5. +- For additional options, see [config](config.py) and run `python moby_main.py --help` to get detailed messages. + +For example, to train `MoBY` with Vision Transformers on a single node with 8 GPUs for 300 epochs, run: + +`MoBY Swin-T`: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_main.py \ +--cfg configs/moby_swin_tiny.yaml --data-path <imagenet-path> --batch-size 64 +``` + +`MoBY DeiT-Small`: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_main.py \ +--cfg configs/moby_deit_small.yaml --data-path <imagenet-path> --batch-size 64 +``` + +### Linear Evaluation + +To run ImageNet-1K linear evaluation on a pre-trained `MoBY` with `Swin Transformer`, run: + +```bash +python -m torch.distributed.launch --nproc_per_node <num-gpus> --master_port 12345 moby_linear.py \ +--cfg <config-file> --data-path <imagenet-path> +``` +**Notes**: + +- Make sure the `<config-file>`, `<output-directory>` and `<job-tag>` are the same as in the pre-training stage. +- Note that some configurations are fixed in [`moby_linear.py`](moby_linear.py#L78) for simplicity. + +For example, to evaluate `MoBY Swin-T` with 8 GPUs on a single node on ImageNet-1K linear evaluation, run: + +```bash +python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ +--cfg configs/moby_swin_tiny.yaml --data-path <imagenet-path> --batch-size 64 +``` + +### Evaluate + +To evaluate a `MoBY` with `Swin Transformer` linear evaluation model on ImageNet-1K, run: + +```bash +python -m torch.distributed.launch --nproc_per_node <num-gpus> --master_port 12345 moby_linear.py \ +--cfg <config-file> --resume <checkpoint> --data-path <imagenet-path> --eval +``` + +For example, to evaluate the provided `MoBY Swin-T` linear evaluation model with a single GPU: + +```bash +python -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 moby_linear.py \ +--cfg configs/moby_swin_tiny.yaml --resume moby_swin_t_300ep_linear.pth --data-path <imagenet-path> --eval +``` diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/logger.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/logger.py new file mode 100644 index 0000000000..a066e55bad --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/logger.py @@ -0,0 +1,41 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import os +import sys +import logging +import functools +from termcolor import colored + + +@functools.lru_cache() +def create_logger(output_dir, dist_rank=0, name=''): + # create logger + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.propagate = False + + # create formatter + fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' + color_fmt = colored('[%(asctime)s %(name)s]', 'green') + \ + colored('(%(filename)s %(lineno)d)', 'yellow') + ': %(levelname)s %(message)s' + + # create console handlers for master process + if dist_rank == 0: + console_handler = 
logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.DEBUG) + console_handler.setFormatter( + logging.Formatter(fmt=color_fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(console_handler) + + # create file handlers + file_handler = logging.FileHandler(os.path.join(output_dir, f'log_rank{dist_rank}.txt'), mode='a') + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(file_handler) + + return logger diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/lr_scheduler.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/lr_scheduler.py new file mode 100644 index 0000000000..4d27289be0 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/lr_scheduler.py @@ -0,0 +1,102 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import torch +from timm.scheduler.cosine_lr import CosineLRScheduler +from timm.scheduler.step_lr import StepLRScheduler +from timm.scheduler.scheduler import Scheduler + + +def build_scheduler(config, optimizer, n_iter_per_epoch): + num_steps = int(config.TRAIN.EPOCHS * n_iter_per_epoch) + warmup_steps = int(config.TRAIN.WARMUP_EPOCHS * n_iter_per_epoch) + decay_steps = int(config.TRAIN.LR_SCHEDULER.DECAY_EPOCHS * n_iter_per_epoch) + + lr_scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == 'cosine': + lr_scheduler = CosineLRScheduler( + optimizer, + t_initial=num_steps, + t_mul=1., + lr_min=config.TRAIN.MIN_LR, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + cycle_limit=1, + t_in_epochs=False, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == 'linear': + lr_scheduler = LinearLRScheduler( + optimizer, + t_initial=num_steps, + lr_min_rate=0.01, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + t_in_epochs=False, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == 'step': + lr_scheduler = StepLRScheduler( + optimizer, + decay_t=decay_steps, + decay_rate=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + warmup_lr_init=config.TRAIN.WARMUP_LR, + warmup_t=warmup_steps, + t_in_epochs=False, + ) + + return lr_scheduler + + +class LinearLRScheduler(Scheduler): + def __init__(self, + optimizer: torch.optim.Optimizer, + t_initial: int, + lr_min_rate: float, + warmup_t=0, + warmup_lr_init=0., + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + initialize=True, + ) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + self.t_initial = t_initial + self.lr_min_rate = lr_min_rate + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.t_in_epochs = t_in_epochs + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + t = t - self.warmup_t + total_t = self.t_initial - self.warmup_t + lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values] + return lrs + + def get_epoch_values(self, epoch: int): + if 
self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/main.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/main.py new file mode 100644 index 0000000000..18e9ad67d6 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/main.py @@ -0,0 +1,347 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import os +import time +import argparse +import datetime +import numpy as np + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist + +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.utils import accuracy, AverageMeter + +from config import get_config +from models import build_model +from data import build_loader +from lr_scheduler import build_scheduler +from optimizer import build_optimizer +from logger import create_logger +from utils import load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor + +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + + +def parse_option(): + parser = argparse.ArgumentParser('Swin Transformer training and evaluation script', add_help=False) + parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', ) + parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. 
", + default=None, + nargs='+', + ) + + # easy config modification + parser.add_argument('--batch-size', type=int, help="batch size for single GPU") + parser.add_argument('--data-path', type=str, help='path to dataset') + parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') + parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') + parser.add_argument('--resume', help='resume from checkpoint') + parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") + parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") + parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'], + help='mixed precision opt level, if O0, no amp is used') + parser.add_argument('--output', default='output', type=str, metavar='PATH', + help='root of output folder, the full path is // (default: output)') + parser.add_argument('--tag', help='tag of experiment') + parser.add_argument('--eval', action='store_true', help='Perform evaluation only') + parser.add_argument('--throughput', action='store_true', help='Test throughput only') + + # distributed training + parser.add_argument("--local_rank", type=int, required=True, help='local rank for DistributedDataParallel') + + args, unparsed = parser.parse_known_args() + + config = get_config(args) + + return args, config + + +def main(config): + dataset_train, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config) + + logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}") + model = build_model(config) + model.cuda() + logger.info(str(model)) + + optimizer = build_optimizer(config, model) + if config.AMP_OPT_LEVEL != "O0": + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) + model_without_ddp = model.module + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"number of params: {n_parameters}") + if hasattr(model_without_ddp, 'flops'): + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + + lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) + + if config.AUG.MIXUP > 0.: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif config.MODEL.LABEL_SMOOTHING > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING) + else: + criterion = torch.nn.CrossEntropyLoss() + + max_accuracy = 0.0 + + if config.TRAIN.AUTO_RESUME: + resume_file = auto_resume_helper(config.OUTPUT) + if resume_file: + if config.MODEL.RESUME: + logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}") + config.defrost() + config.MODEL.RESUME = resume_file + config.freeze() + logger.info(f'auto resuming from {resume_file}') + else: + logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') + + if config.MODEL.RESUME: + max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger) + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: 
{acc1:.1f}%") + if config.EVAL_MODE: + return + + if config.THROUGHPUT_MODE: + throughput(data_loader_val, model, logger) + return + + logger.info("Start training") + start_time = time.time() + for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): + data_loader_train.sampler.set_epoch(epoch) + + train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler) + if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)): + save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger) + + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%") + max_accuracy = max(max_accuracy, acc1) + logger.info(f'Max accuracy: {max_accuracy:.2f}%') + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time {}'.format(total_time_str)) + + +def train_one_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler): + model.train() + optimizer.zero_grad() + + num_steps = len(data_loader) + batch_time = AverageMeter() + loss_meter = AverageMeter() + norm_meter = AverageMeter() + + start = time.time() + end = time.time() + for idx, (samples, targets) in enumerate(data_loader): + samples = samples.cuda(non_blocking=True) + targets = targets.cuda(non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + outputs = model(samples) + + if config.TRAIN.ACCUMULATION_STEPS > 1: + loss = criterion(outputs, targets) + loss = loss / config.TRAIN.ACCUMULATION_STEPS + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0: + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step_update(epoch * num_steps + idx) + else: + loss = criterion(outputs, targets) + optimizer.zero_grad() + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + optimizer.step() + lr_scheduler.step_update(epoch * num_steps + idx) + + torch.cuda.synchronize() + + loss_meter.update(loss.item(), targets.size(0)) + norm_meter.update(grad_norm) + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + lr = optimizer.param_groups[0]['lr'] + memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) + etas = batch_time.avg * (num_steps - idx) + logger.info( + f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' + f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' + f'time 
{batch_time.val:.4f} ({batch_time.avg:.4f})\t' + f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t' + f'mem {memory_used:.0f}MB') + epoch_time = time.time() - start + logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}") + + +@torch.no_grad() +def validate(config, data_loader, model): + criterion = torch.nn.CrossEntropyLoss() + model.eval() + + batch_time = AverageMeter() + loss_meter = AverageMeter() + acc1_meter = AverageMeter() + acc5_meter = AverageMeter() + + end = time.time() + for idx, (images, target) in enumerate(data_loader): + images = images.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + # compute output + output = model(images) + + # measure accuracy and record loss + loss = criterion(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + acc1 = reduce_tensor(acc1) + acc5 = reduce_tensor(acc5) + loss = reduce_tensor(loss) + + loss_meter.update(loss.item(), target.size(0)) + acc1_meter.update(acc1.item(), target.size(0)) + acc5_meter.update(acc5.item(), target.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) + logger.info( + f'Test: [{idx}/{len(data_loader)}]\t' + f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' + f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' + f'Mem {memory_used:.0f}MB') + logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') + return acc1_meter.avg, acc5_meter.avg, loss_meter.avg + + +@torch.no_grad() +def throughput(data_loader, model, logger): + model.eval() + + for idx, (images, _) in enumerate(data_loader): + images = images.cuda(non_blocking=True) + batch_size = images.shape[0] + for i in range(50): + model(images) + torch.cuda.synchronize() + logger.info(f"throughput averaged with 30 times") + tic1 = time.time() + for i in range(30): + model(images) + torch.cuda.synchronize() + tic2 = time.time() + logger.info(f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}") + return + + +if __name__ == '__main__': + _, config = parse_option() + + if config.AMP_OPT_LEVEL != "O0": + assert amp is not None, "amp not installed!" 
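+
+    # The block below expects RANK and WORLD_SIZE to be provided by the launcher
+    # (the `python -m torch.distributed.launch` commands in get_started.md set them),
+    # initializes the NCCL process group, seeds every rank differently, and then
+    # linearly scales the learning rates with the total batch size. For example,
+    # with --batch-size 64 on 8 GPUs the total batch size is 64 * 8 = 512, so the
+    # scale factor is 512 / 512 = 1.0 and BASE_LR is left unchanged; on 16 GPUs it
+    # would be doubled. With gradient accumulation the effective batch grows again,
+    # so the learning rates are further multiplied by ACCUMULATION_STEPS.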
+ + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + rank = int(os.environ["RANK"]) + world_size = int(os.environ['WORLD_SIZE']) + print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}") + else: + rank = -1 + world_size = -1 + torch.cuda.set_device(config.LOCAL_RANK) + torch.distributed.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) + torch.distributed.barrier() + + seed = config.SEED + dist.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # linear scale the learning rate according to total batch size, may not be optimal + linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + # gradient accumulation also need to scale the learning rate + if config.TRAIN.ACCUMULATION_STEPS > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS + config.defrost() + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr + config.TRAIN.MIN_LR = linear_scaled_min_lr + config.freeze() + + os.makedirs(config.OUTPUT, exist_ok=True) + logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}") + + if dist.get_rank() == 0: + path = os.path.join(config.OUTPUT, "config.json") + with open(path, "w") as f: + f.write(config.dump()) + logger.info(f"Full config saved to {path}") + + # print config + logger.info(config.dump()) + + main(config) diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_linear.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_linear.py new file mode 100644 index 0000000000..3ce7b44448 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_linear.py @@ -0,0 +1,386 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import time +import argparse +import datetime +import numpy as np + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist + +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.utils import accuracy, AverageMeter + +from config import get_config +from models import build_model +from data import build_loader +from lr_scheduler import build_scheduler +from optimizer import build_optimizer +from logger import create_logger +from utils import load_pretrained, load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor + +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + + +def parse_option(): + parser = argparse.ArgumentParser('Swin Transformer training and evaluation script', add_help=False) + parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', ) + parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. 
", + default=None, + nargs='+', + ) + + # easy config modification + parser.add_argument('--batch-size', type=int, help="batch size for single GPU") + parser.add_argument('--data-path', type=str, help='path to dataset') + parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') + parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') + parser.add_argument('--resume', help='resume from checkpoint') + parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") + parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") + parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'], + help='mixed precision opt level, if O0, no amp is used') + parser.add_argument('--output', default='output', type=str, metavar='PATH', + help='root of output folder, the full path is // (default: output)') + parser.add_argument('--tag', help='tag of experiment') + parser.add_argument('--eval', action='store_true', help='Perform evaluation only') + parser.add_argument('--throughput', action='store_true', help='Test throughput only') + + # distributed training + parser.add_argument("--local_rank", type=int, required=True, help='local rank for DistributedDataParallel') + + # dev: linear eval settings + parser.add_argument('--lr', type=float, default=1.0, help='the base lr for linear evaluation') + parser.add_argument('--drop-path-rate', type=float, default=0.2, help='the drop path rate used in linear evaluation') + parser.add_argument('--epochs', type=int, help="training epochs") + + args, unparsed = parser.parse_known_args() + + config = get_config(args) + + config.defrost() + # base + config.LINEAR_EVAL.PRETRAINED = os.path.join(config.OUTPUT, 'checkpoint.pth') + config.OUTPUT = os.path.join(config.OUTPUT, 'linear') + # model + config.MODEL.TYPE = 'linear' + config.MODEL.DROP_PATH_RATE = args.drop_path_rate + # aug + config.AUG.SSL_AUG = False + config.AUG.SSL_LINEAR_AUG = True + config.AUG.MIXUP = 0.0 + config.AUG.CUTMIX = 0.0 + config.AUG.CUTMIX_MINMAX = None + # train + config.TRAIN.EPOCHS = 100 + config.TRAIN.WARMUP_EPOCHS = 5 + # sched + config.TRAIN.LR_SCHEDULER.NAME = 'cosine' + # optim + config.TRAIN.OPTIMIZER.NAME = 'sgd' + config.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + config.TRAIN.BASE_LR = args.lr + config.TRAIN.WEIGHT_DECAY = 0.0 + config.freeze() + + return args, config + + +def main(config): + _, dataset_val, data_loader_train, data_loader_val, mixup_fn = build_loader(config) + + logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}") + model = build_model(config) + model.npu() + logger.info(str(model)) + + # fix parameters except head + for name, p in model.named_parameters(): + if 'head' not in name: + p.requires_grad = False + + optimizer = build_optimizer(config, model) + if config.AMP_OPT_LEVEL != "O0": + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) + model_without_ddp = model.module + + # load self-supervised pre-trained model + load_pretrained(model_without_ddp, config.LINEAR_EVAL.PRETRAINED, logger) + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + 
logger.info(f"number of params: {n_parameters}") + if hasattr(model_without_ddp, 'flops'): + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + + lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) + + if config.AUG.MIXUP > 0.: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif config.MODEL.LABEL_SMOOTHING > 0.: + criterion = LabelSmoothingCrossEntropy(smoothing=config.MODEL.LABEL_SMOOTHING) + else: + criterion = torch.nn.CrossEntropyLoss() + + max_accuracy = 0.0 + + if config.TRAIN.AUTO_RESUME: + resume_file = auto_resume_helper(config.OUTPUT) + if resume_file: + if config.MODEL.RESUME: + logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}") + config.defrost() + config.MODEL.RESUME = resume_file + config.freeze() + logger.info(f'auto resuming from {resume_file}') + else: + logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') + + if config.MODEL.RESUME: + max_accuracy = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger) + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%") + if config.EVAL_MODE: + return + + if config.THROUGHPUT_MODE: + throughput(data_loader_val, model, logger) + return + + logger.info("Start linear evaluation training") + start_time = time.time() + for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): + data_loader_train.sampler.set_epoch(epoch) + + train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler) + if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)): + save_checkpoint(config, epoch, model_without_ddp, max_accuracy, optimizer, lr_scheduler, logger) + + acc1, acc5, loss = validate(config, data_loader_val, model) + logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%") + max_accuracy = max(max_accuracy, acc1) + logger.info(f'Max accuracy: {max_accuracy:.2f}%') + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time {}'.format(total_time_str)) + + +def train_one_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler): + model.train() + optimizer.zero_grad() + + num_steps = len(data_loader) + batch_time = AverageMeter() + loss_meter = AverageMeter() + norm_meter = AverageMeter() + + start = time.time() + end = time.time() + for idx, (samples, targets) in enumerate(data_loader): + samples = samples.npu(non_blocking=True) + targets = targets.npu(non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + outputs = model(samples) + + if config.TRAIN.ACCUMULATION_STEPS > 1: + loss = criterion(outputs, targets) + loss = loss / config.TRAIN.ACCUMULATION_STEPS + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + if (idx + 1) % 
config.TRAIN.ACCUMULATION_STEPS == 0: + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step_update(epoch * num_steps + idx) + else: + loss = criterion(outputs, targets) + optimizer.zero_grad() + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) + else: + grad_norm = get_grad_norm(model.parameters()) + optimizer.step() + lr_scheduler.step_update(epoch * num_steps + idx) + + torch.npu.synchronize() + + loss_meter.update(loss.item(), targets.size(0)) + norm_meter.update(grad_norm) + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + lr = optimizer.param_groups[0]['lr'] + memory_used = torch.npu.max_memory_allocated() / (1024.0 * 1024.0) + etas = batch_time.avg * (num_steps - idx) + logger.info( + f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' + f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' + f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' + f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t' + f'mem {memory_used:.0f}MB') + epoch_time = time.time() - start + logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}") + + +@torch.no_grad() +def validate(config, data_loader, model): + criterion = torch.nn.CrossEntropyLoss() + model.eval() + + batch_time = AverageMeter() + loss_meter = AverageMeter() + acc1_meter = AverageMeter() + acc5_meter = AverageMeter() + + end = time.time() + for idx, (images, target) in enumerate(data_loader): + images = images.npu(non_blocking=True) + target = target.npu(non_blocking=True) + + # compute output + output = model(images) + + # measure accuracy and record loss + loss = criterion(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + acc1 = reduce_tensor(acc1) + acc5 = reduce_tensor(acc5) + loss = reduce_tensor(loss) + + loss_meter.update(loss.item(), target.size(0)) + acc1_meter.update(acc1.item(), target.size(0)) + acc5_meter.update(acc5.item(), target.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + memory_used = torch.npu.max_memory_allocated() / (1024.0 * 1024.0) + logger.info( + f'Test: [{idx}/{len(data_loader)}]\t' + f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' + f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' + f'Mem {memory_used:.0f}MB') + logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') + return acc1_meter.avg, acc5_meter.avg, loss_meter.avg + + +@torch.no_grad() +def throughput(data_loader, model, logger): + model.eval() + + for idx, (images, _) in enumerate(data_loader): + images = images.npu(non_blocking=True) + batch_size = images.shape[0] + for i in range(50): + model(images) + torch.npu.synchronize() + logger.info(f"throughput averaged with 30 times") + tic1 = time.time() + for i in range(30): + model(images) + torch.npu.synchronize() + tic2 = time.time() + logger.info(f"batch_size {batch_size} throughput {30 * 
batch_size / (tic2 - tic1)}") + return + + +if __name__ == '__main__': + _, config = parse_option() + + if config.AMP_OPT_LEVEL != "O0": + assert amp is not None, "amp not installed!" + + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + rank = int(os.environ["RANK"]) + world_size = int(os.environ['WORLD_SIZE']) + print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}") + else: + rank = -1 + world_size = -1 + torch.npu.set_device(config.LOCAL_RANK) + torch.distributed.init_process_group(backend='hccl', init_method='env://', world_size=world_size, rank=rank) + torch.distributed.barrier() + + seed = config.SEED + dist.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # linear scale the learning rate according to total batch size, may not be optimal + linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + # gradient accumulation also need to scale the learning rate + if config.TRAIN.ACCUMULATION_STEPS > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS + config.defrost() + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr + config.TRAIN.MIN_LR = linear_scaled_min_lr + config.freeze() + + os.makedirs(config.OUTPUT, exist_ok=True) + logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}") + + if dist.get_rank() == 0: + path = os.path.join(config.OUTPUT, "config.json") + with open(path, "w") as f: + f.write(config.dump()) + logger.info(f"Full config saved to {path}") + + # print config + logger.info(config.dump()) + + main(config) diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_main.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_main.py new file mode 100644 index 0000000000..81e58ee9e5 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/moby_main.py @@ -0,0 +1,283 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import time +import argparse +import datetime +import numpy as np + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from timm.utils import AverageMeter +from torch._six import inf + +from config import get_config +from models import build_model +from data import build_loader +from lr_scheduler import build_scheduler +from optimizer import build_optimizer +from logger import create_logger +from utils import load_checkpoint, save_checkpoint, get_grad_norm, auto_resume_helper, reduce_tensor + +try: + # noinspection PyUnresolvedReferences + from apex import amp + amp.register_half_function(torch.nn.functional, 'softmax') + amp.register_half_function(torch.nn.functional, 'layer_norm') + amp.register_half_function(torch, 'fast_gelu') +except ImportError: + amp = None + + +def clip_grad_norm_(parameters, max_norm, optimizer, norm_type=2): + # return 
torch.nn.utils.clip_grad_norm_(parameters, max_norm) + torch.npu.synchronize() + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + max_norm = float(max_norm) + norm_type = float(norm_type) + combine_grads = optimizer.get_optimizer_combined_grads()[0] + if norm_type == inf: + parameters = list(filter(lambda p: p.grad is not None, parameters)) + total_norm = max(p.grad.detach().abs().max() for p in parameters) + else: + torch.npu.synchronize() + total_norm = torch.norm(combine_grads.detach(), norm_type) + torch.npu.synchronize() + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1: + combine_grads.detach().mul_(clip_coef) + return total_norm + + + +def parse_option(): + parser = argparse.ArgumentParser('MoBY training and evaluation script', add_help=False) + parser.add_argument('--cfg', type=str, required=True, metavar="FILE", help='path to config file', ) + parser.add_argument( + "--opts", + help="Modify config options by adding 'KEY VALUE' pairs. ", + default=None, + nargs='+', + ) + + # easy config modification + parser.add_argument('--batch-size', type=int, help="batch size for single GPU") + parser.add_argument('--data-path', type=str, help='path to dataset') + parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') + parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], + help='no: no cache, ' + 'full: cache all data, ' + 'part: sharding the dataset into nonoverlapping pieces and only cache one piece') + parser.add_argument('--resume', help='resume from checkpoint') + parser.add_argument('--accumulation-steps', type=int, help="gradient accumulation steps") + parser.add_argument('--use-checkpoint', action='store_true', + help="whether to use gradient checkpointing to save memory") + parser.add_argument('--amp-opt-level', type=str, default='O1', choices=['O0', 'O1', 'O2'], + help='mixed precision opt level, if O0, no amp is used') + parser.add_argument('--output', default='output', type=str, metavar='PATH', + help='root of output folder, the full path is // (default: output)') + parser.add_argument('--tag', help='tag of experiment') + parser.add_argument('--eval', action='store_true', help='Perform evaluation only') + parser.add_argument('--throughput', action='store_true', help='Test throughput only') + parser.add_argument('--epochs', type=int, help="training epochs") + parser.add_argument('--steps', type=int, help="training steps") + + # distributed training + parser.add_argument("--local_rank", type=int, required=True, help='local rank for DistributedDataParallel') + + args, unparsed = parser.parse_known_args() + + config = get_config(args) + + return args, config + + +def main(config, args): + dataset_train, _, data_loader_train, _, _ = build_loader(config) + + config.defrost() + config.DATA.TRAINING_IMAGES = len(dataset_train) + config.freeze() + + logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}") + model = build_model(config) + model.npu() + logger.info(str(model)) + + optimizer = build_optimizer(config, model) + if config.AMP_OPT_LEVEL != "O0": + try: + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL, combine_grad=True, user_cast_preferred=True) + except: + print('user_cast_preferred not supported in current apex version. 
Update for high performance.') + model, optimizer = amp.initialize(model, optimizer, opt_level=config.AMP_OPT_LEVEL, combine_grad=True) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) + model_without_ddp = model.module + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"number of params: {n_parameters}") + if hasattr(model_without_ddp, 'flops'): + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + + lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) + + if config.TRAIN.AUTO_RESUME: + resume_file = auto_resume_helper(config.OUTPUT) + if resume_file: + if config.MODEL.RESUME: + logger.warning(f"auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}") + config.defrost() + config.MODEL.RESUME = resume_file + config.freeze() + logger.info(f'auto resuming from {resume_file}') + else: + logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') + + if config.MODEL.RESUME: + _ = load_checkpoint(config, model_without_ddp, optimizer, lr_scheduler, logger) + + logger.info("Start self-supervised pre-training") + start_time = time.time() + for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): + data_loader_train.sampler.set_epoch(epoch) + + train_one_epoch(config, model, data_loader_train, optimizer, epoch, lr_scheduler, args) + if dist.get_rank() == 0 and (epoch % config.SAVE_FREQ == 0 or epoch == (config.TRAIN.EPOCHS - 1)): + save_checkpoint(config, epoch, model_without_ddp, 0.0, optimizer, lr_scheduler, logger) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time {}'.format(total_time_str)) + + +def train_one_epoch(config, model, data_loader, optimizer, epoch, lr_scheduler, args): + model.train() + optimizer.zero_grad() + + num_steps = len(data_loader) + data_time = AverageMeter() + batch_time = AverageMeter() + loss_meter = AverageMeter() + norm_meter = AverageMeter() + + start = time.time() + end = time.time() + for idx, (samples_1, samples_2, targets) in enumerate(data_loader): + if idx == args.steps: + exit(0) + data_time.update(time.time() - end) + samples_1 = samples_1.npu(non_blocking=True) + samples_2 = samples_2.npu(non_blocking=True) + targets = targets.npu(non_blocking=True) + + loss = model(samples_1, samples_2, optimizer) + + optimizer.zero_grad() + if config.AMP_OPT_LEVEL != "O0": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = clip_grad_norm_(amp.master_params(optimizer), config.TRAIN.CLIP_GRAD, optimizer) + else: + grad_norm = get_grad_norm(amp.master_params(optimizer)) + else: + loss.backward() + if config.TRAIN.CLIP_GRAD: + grad_norm = clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD, optimizer) + else: + grad_norm = get_grad_norm(model.parameters()) + optimizer.step() + lr_scheduler.step_update(epoch * num_steps + idx) + + torch.npu.synchronize() + + loss_meter.update(loss.item(), targets.size(0)) + norm_meter.update(grad_norm) + batch_time.update(time.time() - end) + end = time.time() + + if idx % config.PRINT_FREQ == 0: + lr = optimizer.param_groups[0]['lr'] + memory_used = torch.npu.max_memory_allocated() / (1024.0 * 1024.0) + etas = batch_time.avg * (num_steps - idx) + logger.info( + f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' + f'eta 
{datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' + f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' + f'data {data_time.val:.4f} ({data_time.avg:.4f})\t' + f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' + f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f})\t' + f'mem {memory_used:.0f}MB') + epoch_time = time.time() - start + logger.info(f"EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}") + + +if __name__ == '__main__': + args, config = parse_option() + + if config.AMP_OPT_LEVEL != "O0": + assert amp is not None, "amp not installed!" + + option = {} + + # enable high-perfomance-mode on aicore + option["ACL_OP_SELECT_IMPL_MODE"] = "high_performance" + option["ACL_OPTYPELIST_FOR_IMPLMODE"] = "LayerNorm" + + torch.npu.set_option(option) + + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + rank = int(os.environ["RANK"]) + world_size = int(os.environ['WORLD_SIZE']) + print(f"RANK and WORLD_SIZE in environ: {rank}/{world_size}") + else: + rank = -1 + world_size = -1 + torch.npu.set_device(config.LOCAL_RANK) + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29680' + torch.distributed.init_process_group(backend='hccl', init_method='env://', world_size=world_size, rank=rank) + torch.distributed.barrier() + + seed = config.SEED + dist.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # linear scale the learning rate according to total batch size, may not be optimal + linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * dist.get_world_size() / 512.0 + # gradient accumulation also need to scale the learning rate + if config.TRAIN.ACCUMULATION_STEPS > 1: + linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS + linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS + config.defrost() + config.TRAIN.BASE_LR = linear_scaled_lr + config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr + config.TRAIN.MIN_LR = linear_scaled_min_lr + config.freeze() + + os.makedirs(config.OUTPUT, exist_ok=True) + logger = create_logger(output_dir=config.OUTPUT, dist_rank=dist.get_rank(), name=f"{config.MODEL.NAME}") + + if dist.get_rank() == 0: + path = os.path.join(config.OUTPUT, "config.json") + with open(path, "w") as f: + f.write(config.dump()) + logger.info(f"Full config saved to {path}") + + # print config + logger.info(config.dump()) + + main(config, args) diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/__init__.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/__init__.py new file mode 100644 index 0000000000..2d9c65e39f --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/__init__.py @@ -0,0 +1 @@ +from .build import build_model \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/build.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/build.py new file mode 100644 index 0000000000..1c054a4ef8 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/build.py @@ -0,0 +1,75 @@ +# -------------------------------------------------------- +# Swin Transformer 
+# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +from functools import partial +# from timm.models import vit_deit_small_patch16_224 + +from .swin_transformer import SwinTransformer +from .moby import MoBY + +# vit_models = dict( +# deit_small=vit_deit_small_patch16_224, +# ) + + +def build_model(config): + model_type = config.MODEL.TYPE + encoder_type = config.MODEL.MOBY.ENCODER + + if encoder_type == 'swin': + enc = partial( + SwinTransformer, + img_size=config.DATA.IMG_SIZE, + patch_size=config.MODEL.SWIN.PATCH_SIZE, + in_chans=config.MODEL.SWIN.IN_CHANS, + embed_dim=config.MODEL.SWIN.EMBED_DIM, + depths=config.MODEL.SWIN.DEPTHS, + num_heads=config.MODEL.SWIN.NUM_HEADS, + window_size=config.MODEL.SWIN.WINDOW_SIZE, + mlp_ratio=config.MODEL.SWIN.MLP_RATIO, + qkv_bias=config.MODEL.SWIN.QKV_BIAS, + qk_scale=config.MODEL.SWIN.QK_SCALE, + drop_rate=config.MODEL.DROP_RATE, + ape=config.MODEL.SWIN.APE, + patch_norm=config.MODEL.SWIN.PATCH_NORM, + use_checkpoint=config.TRAIN.USE_CHECKPOINT, + norm_befor_mlp=config.MODEL.SWIN.NORM_BEFORE_MLP, + ) + elif encoder_type.startswith('vit') or encoder_type.startswith('deit'): + enc = vit_models[encoder_type] + else: + raise NotImplementedError(f'--> Unknown encoder_type: {encoder_type}') + + if model_type == 'moby': + encoder = enc( + num_classes=0, + drop_path_rate=config.MODEL.MOBY.ONLINE_DROP_PATH_RATE, + ) + encoder_k = enc( + num_classes=0, + drop_path_rate=config.MODEL.MOBY.TARGET_DROP_PATH_RATE, + ) + model = MoBY( + cfg=config, + encoder=encoder, + encoder_k=encoder_k, + contrast_momentum=config.MODEL.MOBY.CONTRAST_MOMENTUM, + contrast_temperature=config.MODEL.MOBY.CONTRAST_TEMPERATURE, + contrast_num_negative=config.MODEL.MOBY.CONTRAST_NUM_NEGATIVE, + proj_num_layers=config.MODEL.MOBY.PROJ_NUM_LAYERS, + pred_num_layers=config.MODEL.MOBY.PRED_NUM_LAYERS, + ) + elif model_type == 'linear': + model = enc( + num_classes=config.MODEL.NUM_CLASSES, + drop_path_rate=config.MODEL.DROP_PATH_RATE, + ) + else: + raise NotImplementedError(f'--> Unknown model_type: {model_type}') + + return model diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/moby.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/moby.py new file mode 100644 index 0000000000..22f7353cb2 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/moby.py @@ -0,0 +1,272 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Zhenda Xie +# -------------------------------------------------------- + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +from diffdist import functional + +from apex.contrib.combine_tensors import combine_npu + +from apex import amp + + +def check_keywords_in_name(name, keywords=()): + isin = False + for keyword in keywords: + if keyword in name: + isin = True + return isin + + +class NpuLinear(nn.Linear): + def forward(self, input): + output = input.matmul(self.weight.t()) + if self.bias is not None: + output = output.npu_format_cast(2) + self.bias # use format ND(2) to enable high accuracy on NPU + return output + # return torch.npu_linear(input, self.weight, self.bias) + + +def dist_collect(x): + """ collect all tensor 
from all GPUs + args: + x: shape (mini_batch, ...) + returns: + shape (mini_batch * num_gpu, ...) + """ + x = x.contiguous() + out_list = [torch.zeros_like(x, device=x.device, dtype=x.dtype).contiguous() + for _ in range(dist.get_world_size())] + out_list = functional.all_gather(out_list, x) + return torch.cat(out_list, dim=0).contiguous() + + +class MoBY(nn.Module): + def __init__(self, + cfg, + encoder, + encoder_k, + contrast_momentum=0.99, + contrast_temperature=0.2, + contrast_num_negative=4096, + proj_num_layers=2, + pred_num_layers=2, + **kwargs): + super().__init__() + + self.cfg = cfg + + self.encoder = encoder + self.encoder_k = encoder_k + + self.contrast_momentum = contrast_momentum + self.contrast_temperature = contrast_temperature + self.contrast_num_negative = contrast_num_negative + + self.proj_num_layers = proj_num_layers + self.pred_num_layers = pred_num_layers + + self.projector = MoBYMLP(in_dim=self.encoder.num_features, num_layers=proj_num_layers) + self.projector_k = MoBYMLP(in_dim=self.encoder.num_features, num_layers=proj_num_layers) + self.predictor = MoBYMLP(num_layers=pred_num_layers) + self.predictor_k = MoBYMLP(num_layers=pred_num_layers) # ensure consistant archi of q and k to use fused momentum update + + for param_q, param_k in zip(self.encoder.parameters(), self.encoder_k.parameters()): + param_k.data.copy_(param_q.data) # initialize + param_k.requires_grad = False # not update by gradient + + for param_q, param_k in zip(self.projector.parameters(), self.projector_k.parameters()): + param_k.data.copy_(param_q.data) + param_k.requires_grad = False + + for param_q, param_k in zip(self.predictor.parameters(), self.predictor_k.parameters()): + param_k.data.copy_(param_q.data) + param_k.requires_grad = False + + if self.cfg.MODEL.SWIN.NORM_BEFORE_MLP == 'bn': + nn.SyncBatchNorm.convert_sync_batchnorm(self.encoder) + nn.SyncBatchNorm.convert_sync_batchnorm(self.encoder_k) + + nn.SyncBatchNorm.convert_sync_batchnorm(self.projector) + nn.SyncBatchNorm.convert_sync_batchnorm(self.projector_k) + nn.SyncBatchNorm.convert_sync_batchnorm(self.predictor) + + self.K = int(self.cfg.DATA.TRAINING_IMAGES * 1. / dist.get_world_size() / self.cfg.DATA.BATCH_SIZE) * self.cfg.TRAIN.EPOCHS + self.k = int(self.cfg.DATA.TRAINING_IMAGES * 1. / dist.get_world_size() / self.cfg.DATA.BATCH_SIZE) * self.cfg.TRAIN.START_EPOCH + + # create the queue + self.register_buffer("queue1", torch.randn(256, self.contrast_num_negative)) + self.register_buffer("queue2", torch.randn(256, self.contrast_num_negative)) + self.queue1 = F.normalize(self.queue1, dim=0) + self.queue2 = F.normalize(self.queue2, dim=0) + + self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) + + # for fused momentum update + self.is_fused = False + self.step = 0 + + @torch.no_grad() + def _momentum_update_key_encoder(self): + """ + Momentum update of the key encoder + """ + _contrast_momentum = 1. - (1. - self.contrast_momentum) * (np.cos(np.pi * self.k / self.K) + 1) / 2. + self.k = self.k + 1 + + for param_q, param_k in zip(self.encoder.parameters(), self.encoder_k.parameters()): + param_k.data = param_k.data * _contrast_momentum + param_q.data * (1. - _contrast_momentum) + + for param_q, param_k in zip(self.projector.parameters(), self.projector_k.parameters()): + param_k.data = param_k.data * _contrast_momentum + param_q.data * (1. 
- _contrast_momentum) + + def _get_fused_params(self, params): + pg = [] + for v in params: + if v.data.dtype.is_floating_point: + pg.append(v.data) + return combine_npu(pg) + + def _get_fused_params_pg(self, model): + skip = {} + skip_keywords = {} + """ + 'relative_position_bias_table' in different group of 'MOBY' and 'SwinTransformer' + if hasattr(model, 'no_weight_decay'): + skip = model.no_weight_decay() + if hasattr(model, 'no_weight_decay_keywords'): + skip_keywords = model.no_weight_decay_keywords() + """ + has_decay, no_decay = [], [] + for name, param in model.named_parameters(): + if len(param.shape) == 1 or name.endswith('.bias') or (name in skip) or \ + check_keywords_in_name(name, skip_keywords): + no_decay.append(param) + else: + has_decay.append(param) + return has_decay, no_decay + + @torch.no_grad() + def _fused_momentum_update_key_encoder(self, optimizer): + """ + Update the key encoder with fused parameters in optimizer to accelerate on NPU + """ + _contrast_momentum = 1. - (1. - self.contrast_momentum) * (np.cos(np.pi * self.k / self.K) + 1) / 2. + self.k = self.k + 1 + + d = _contrast_momentum + d_inv = 1. - d + + if not self.is_fused: + self.param_q_fused = optimizer.get_model_combined_params()[0] + + h0, n0 = self._get_fused_params_pg(self.encoder_k) + h1, n1 = self._get_fused_params_pg(self.projector_k) + h2, n2 = self._get_fused_params_pg(self.predictor_k) + self.param_k_fused = combine_npu(h0 + h1 + h2 + n0 + n1 + n2) + + self.is_fused = True + + self.param_k_fused *= d + self.param_k_fused += self.param_q_fused * d_inv + + @torch.no_grad() + def _dequeue_and_enqueue(self, keys1, keys2): + # gather keys before updating queue + keys1 = dist_collect(keys1) + keys2 = dist_collect(keys2) + + batch_size = keys1.shape[0] + + ptr = int(self.queue_ptr) + assert self.contrast_num_negative % batch_size == 0 # for simplicity + + # replace the keys at ptr (dequeue and enqueue) + self.queue1[:, ptr:ptr + batch_size] = keys1.T + self.queue2[:, ptr:ptr + batch_size] = keys2.T + ptr = (ptr + batch_size) % self.contrast_num_negative # move pointer + + self.queue_ptr[0] = ptr + + def contrastive_loss(self, q, k, queue): + + # positive logits: Nx1 + l_pos = torch.einsum('nc,nc->n', [q.half(), k.half()]).unsqueeze(-1) + # negative logits: NxK + l_neg = torch.einsum('nc,ck->nk', [q.half(), queue.clone().detach().half()]) + + # logits: Nx(1+K) + logits = torch.cat([l_pos, l_neg], dim=1) + + # apply temperature + logits /= self.contrast_temperature + + # labels: positive key indicators + labels = torch.zeros(logits.shape[0], dtype=torch.long).npu() + + return F.cross_entropy(logits, labels) + + def forward(self, im_1, im_2, optimizer): + feat_1 = self.encoder(im_1) # queries: NxC + proj_1 = self.projector(feat_1) + pred_1 = self.predictor(proj_1) + pred_1 = F.normalize(pred_1, dim=1) + + feat_2 = self.encoder(im_2) + proj_2 = self.projector(feat_2) + pred_2 = self.predictor(proj_2) + pred_2 = F.normalize(pred_2, dim=1) + + # compute key features + with torch.no_grad(): # no gradient to keys + if self.step == 0: + self._momentum_update_key_encoder() # update the key encoder + self.step += 1 + else: + self._fused_momentum_update_key_encoder(optimizer) + + feat_1_ng = self.encoder_k(im_1) # keys: NxC + proj_1_ng = self.projector_k(feat_1_ng) + proj_1_ng = F.normalize(proj_1_ng, dim=1) + + feat_2_ng = self.encoder_k(im_2) + proj_2_ng = self.projector_k(feat_2_ng) + proj_2_ng = F.normalize(proj_2_ng, dim=1) + + # compute loss + loss = self.contrastive_loss(pred_1, proj_2_ng, self.queue2) \ 
+ + self.contrastive_loss(pred_2, proj_1_ng, self.queue1) + + self._dequeue_and_enqueue(proj_1_ng, proj_2_ng) + + return loss + + +class MoBYMLP(nn.Module): + def __init__(self, in_dim=256, inner_dim=4096, out_dim=256, num_layers=2): + super(MoBYMLP, self).__init__() + + # hidden layers + linear_hidden = [nn.Identity()] + for i in range(num_layers - 1): + linear_hidden.append(NpuLinear(in_dim if i == 0 else inner_dim, inner_dim)) + linear_hidden.append(nn.BatchNorm1d(inner_dim)) + linear_hidden.append(nn.ReLU(inplace=True)) + self.linear_hidden = nn.Sequential(*linear_hidden) + + self.linear_out = NpuLinear(in_dim if num_layers == 1 else inner_dim, out_dim) if num_layers >= 1 else nn.Identity() + + def forward(self, x): + x = self.linear_hidden(x) + x = self.linear_out(x) + + return x diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/swin_transformer.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/swin_transformer.py new file mode 100644 index 0000000000..7db6fc91d4 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/models/swin_transformer.py @@ -0,0 +1,750 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +from unittest import result +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from apex import amp + + +""" +LayerNorm is able to run on CUBE in some cases for high-performance. +To avoid unnecessary format transformation, a blacklist is set here. + +TODO: create blacklist automatically +""" +_LAYERNORM_FORMAT_NZ = True +_LAYERNORM_FORMAT_NZ_BLACKLIST = {192, 384, 768, 1536} + +class FastGELU(nn.Module): + """fast version of nn.GELU()""" + + @staticmethod + def forward(x): + return torch.fast_gelu(x) + +def drop_path(x, random_tensor, drop_prob: float = 0., training: bool = False): + """ + Less op than timm version. + But memory copy and aicpu op 'Uniform' cannot be avoided, not faster apparently + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor_new = random_tensor.clone().detach().uniform_(keep_prob, keep_prob + 1).half() + random_tensor_new.floor_() + output = x.div(keep_prob) + return output + +class MatmulApply(torch.autograd.Function): + @staticmethod + def forward(ctx, self, mat2): + # y = a * b^T + ctx.save_for_backward(self, mat2) + result = torch.matmul(self, mat2.transpose(-2, -1)) + return result + @staticmethod + def backward(ctx, grad): + # da: grad * b + # db: grad^T * a + self, mat2 = ctx.saved_tensors + self_grad = torch.npu_bmmV2(grad, mat2, []) + mat2_grad = torch.npu_bmmV2(grad.transpose(-2, -1), self, []) + return self_grad, mat2_grad + +matmul_transpose = MatmulApply.apply + +class RollIndexSelect(torch.autograd.Function): + @staticmethod + def forward(ctx, input, index_fp, index_bp): + N, H, W, C = input.shape + ctx.input = input + ctx.index_bp = index_bp + result = input.reshape(N, H * W, C).index_select(1, index_fp).reshape(N, H, W, C) + return result + @staticmethod + def backward(ctx, grad): + input = ctx.input + N, H, W, C = input.shape + index_bp = ctx.index_bp + grad_input = grad.reshape(N, H * W, C).index_select(1, index_bp).reshape(N, H, W, C) + return grad_input, None, None + +roll_index_select = RollIndexSelect.apply + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=FastGELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) if drop > 0. else nn.Identity() + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B_, H_, W_, C_ = windows.shape + + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + + # x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + C = int((B_ * H_ * W_ * C_) / (B * H * W)) + x = x.npu_confusion_transpose([0, 1, 3, 2, 4, 5], (B, H, W, C), True) + + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. 
Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = torch.tensor(qk_scale) if qk_scale else torch.tensor(head_dim ** -0.5) + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) if attn_drop > 0. else nn.Identity() + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0. else nn.Identity() + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + @amp.half_function + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4).contiguous().npu_format_cast(2) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + if not self.scale.device == q.device: + self.scale = self.scale.to(q.device).to(q.dtype) + + q = q * self.scale + # attn = (q @ k.transpose(-2, -1)) + attn = matmul_transpose(q, k) + + # relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + # self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = torch.index_select(self.relative_position_bias_table, 0, self.relative_position_index.view(-1)).view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0).half() + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + # x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = (attn @ v).npu_format_cast(2).npu_confusion_transpose([0, 2, 1, 3], (B_, N, C), True) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, 
window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +def get_roll_index(H, W, shifts): + index = torch.arange(0, H * W).reshape(H, W) + index_fp = torch.roll(index, shifts=(shifts, shifts), dims=(0, 1)).reshape(-1).long() + index_bp = {i:idx for idx, i in enumerate(index_fp.numpy().tolist())} + index_bp = [index_bp[i] for i in range(H * W)] + index_bp = torch.LongTensor(index_bp) + return [index_fp, index_bp] + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=FastGELU, norm_layer=nn.LayerNorm, norm_before_mlp='ln'): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.norm_before_mlp = norm_before_mlp + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + if self.norm_before_mlp == 'ln': + self.norm2 = nn.LayerNorm(dim) + elif self.norm_before_mlp == 'bn': + self.norm2 = lambda x: nn.BatchNorm1d(dim)(x.transpose(1, 2)).transpose(1, 2) + else: + raise NotImplementedError + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + self.index_dict = {} + self.index_device = torch.device('cpu') + hw_list = [56, 28, 14, 7] # H/W of feature maps + for hw in hw_list: + H, W = hw, hw + self.index_dict[(H, W, self.shift_size)] = get_roll_index(H, W, self.shift_size) + self.index_dict[(H, W, -self.shift_size)] = get_roll_index(H, W, -self.shift_size) + + def cast_index_device(self, device): + for v in self.index_dict.values(): + v[0] = v[0].to(device) + v[1] = v[1].to(device) + + @amp.half_function + def forward(self, x): + if not self.index_device == x.device: + self.cast_index_device(x.device) + self.index_device = x.device + + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + if _LAYERNORM_FORMAT_NZ and x.size(-1) not in _LAYERNORM_FORMAT_NZ_BLACKLIST: + x = x.npu_format_cast(29) + x = self.norm1(x) + x = x.view(B, H, W, C).npu_format_cast(2) + + # cyclic shift + if self.shift_size > 0: + # shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + index_fp = self.index_dict[(H, W, -self.shift_size)][0] + index_bp = self.index_dict[(H, W, -self.shift_size)][1] + shifted_x = roll_index_select(x, index_fp, index_bp) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + # x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + index_fp = self.index_dict[(H, W, self.shift_size)][0] + index_bp = self.index_dict[(H, W, self.shift_size)][1] + x = roll_index_select(shifted_x, index_fp, index_bp) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + + # x = x + self.drop_path(self.mlp(self.norm2(x))) + if _LAYERNORM_FORMAT_NZ and x.size(-1) not in 
_LAYERNORM_FORMAT_NZ_BLACKLIST: + x = x + self.drop_path(self.mlp(self.norm2(x.npu_format_cast(29)))) + else: + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + """ + Using depth-wise conv2d to merge patches for high performance. + """ + C_list = [96, 192, 384] + self.kernel_dict = {} + self.kernel_device = torch.device('cpu') + for c in C_list: + kernel0 = torch.FloatTensor([[1, 0], [0, 0]]).unsqueeze(0).unsqueeze(0).repeat(c, 1, 1, 1) + kernel1 = torch.FloatTensor([[0, 0], [1, 0]]).unsqueeze(0).unsqueeze(0).repeat(c, 1, 1, 1) + kernel2 = torch.FloatTensor([[0, 1], [0, 0]]).unsqueeze(0).unsqueeze(0).repeat(c, 1, 1, 1) + kernel3 = torch.FloatTensor([[0, 0], [0, 1]]).unsqueeze(0).unsqueeze(0).repeat(c, 1, 1, 1) + kernel = torch.cat([kernel0, kernel1, kernel2, kernel3], 0) + self.kernel_dict[c] = kernel + + def cast_kernel_device(self, device): + for k, v in self.kernel_dict.items(): + self.kernel_dict[k] = v.to(device) + + @amp.half_function + def forward(self, x): + """ + x: B, H*W, C + + A depth-wise conv2d version with save semantics of following op + # x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + # x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + # x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + # x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + # x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + """ + if not self.kernel_device == x.device: + self.cast_kernel_device(x.device) + self.kernel_device = x.device + + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." 
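Editor's note: the docstring above claims the grouped 2x2 conv reproduces the x0..x3 strided-slice concatenation. A small stand-alone CPU sketch of that equivalence check (separate from the method itself; shapes are illustrative assumptions and the NPU-specific format casts are omitted):

    import torch
    import torch.nn.functional as F

    # Illustrative check: grouped conv2d with 2x2 one-hot kernels vs. strided slicing.
    B, H, W, C = 2, 8, 8, 4
    x = torch.randn(B, H, W, C)

    # reference: slice even/odd rows and columns, then concat on channels
    x0 = x[:, 0::2, 0::2, :]
    x1 = x[:, 1::2, 0::2, :]
    x2 = x[:, 0::2, 1::2, :]
    x3 = x[:, 1::2, 1::2, :]
    ref = torch.cat([x0, x1, x2, x3], -1).reshape(B, (H // 2) * (W // 2), 4 * C)

    # conv version: one 2x2 one-hot kernel per output slot, depth-wise over 4*C channels
    kernels = [torch.tensor([[1., 0.], [0., 0.]]),   # top-left     -> x0
               torch.tensor([[0., 0.], [1., 0.]]),   # bottom-left  -> x1
               torch.tensor([[0., 1.], [0., 0.]]),   # top-right    -> x2
               torch.tensor([[0., 0.], [0., 1.]])]   # bottom-right -> x3
    weight = torch.cat([k.view(1, 1, 2, 2).repeat(C, 1, 1, 1) for k in kernels], 0)
    y = F.conv2d(x.permute(0, 3, 1, 2).repeat(1, 4, 1, 1), weight, stride=2, groups=4 * C)
    out = y.permute(0, 2, 3, 1).reshape(B, (H // 2) * (W // 2), 4 * C)

    print(torch.allclose(ref, out))  # expected: True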
+ + x = x.view(B, H, W, C) + + # depth-conv2d version + x = x.permute(0, 3, 1, 2).repeat(1, 4, 1, 1) # B 4*C H W + kernel = self.kernel_dict[C] + x = torch.nn.functional.conv2d(x, kernel, stride=2, groups=4*C) # B 4*C H/2 W/2 + x = x.npu_format_cast(0).npu_confusion_transpose([0, 2, 3, 1], (B, int(H * W / 4), 4 * C), True) + + if _LAYERNORM_FORMAT_NZ and x.size(-1) not in _LAYERNORM_FORMAT_NZ_BLACKLIST: + x = x.npu_format_cast(2).npu_format_cast(29).contiguous() + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, + norm_before_mlp='ln'): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, norm_before_mlp=norm_before_mlp) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. 
+ in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2).contiguous() # B Ph*Pw C + if self.norm is not None: + if _LAYERNORM_FORMAT_NZ and x.size(-1) not in _LAYERNORM_FORMAT_NZ_BLACKLIST: + x = x.npu_format_cast(2).npu_format_cast(29) + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, norm_before_mlp='ln', **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) if drop_rate > 0. else nn.Identity() + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + norm_before_mlp=norm_before_mlp) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + @amp.half_function + def forward_features(self, x): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose(1, 2)) # B C 1 + x = torch.flatten(x, 1) + return x + + @amp.half_function + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops diff --git 
a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/optimizer.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/optimizer.py new file mode 100644 index 0000000000..2faceda9f0 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/optimizer.py @@ -0,0 +1,58 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +from torch import optim as optim + + +def build_optimizer(config, model): + """ + Build optimizer, set weight decay of normalization to 0 by default. + """ + skip = {} + skip_keywords = {} + if hasattr(model, 'no_weight_decay'): + skip = model.no_weight_decay() + if hasattr(model, 'no_weight_decay_keywords'): + skip_keywords = model.no_weight_decay_keywords() + parameters = set_weight_decay(model, skip, skip_keywords) + + opt_lower = config.TRAIN.OPTIMIZER.NAME.lower() + optimizer = None + if opt_lower == 'sgd': + optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True, + lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY) + elif opt_lower == 'adamw': + from apex.optimizers import NpuFusedAdamW + optimizer = NpuFusedAdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS, + lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY) + + return optimizer + + +def set_weight_decay(model, skip_list=(), skip_keywords=()): + has_decay = [] + no_decay = [] + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \ + check_keywords_in_name(name, skip_keywords): + no_decay.append(param) + # print(f"{name} has no weight decay") + else: + has_decay.append(param) + return [{'params': has_decay}, + {'params': no_decay, 'weight_decay': 0.}] + + +def check_keywords_in_name(name, keywords=()): + isin = False + for keyword in keywords: + if keyword in name: + isin = True + return isin diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh new file mode 100644 index 0000000000..cd8463dbcc --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh @@ -0,0 +1,2 @@ +python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ +--cfg configs/moby_swin_tiny.yaml --data-path /data/imagenet > linear.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh new file mode 100644 index 0000000000..fe2a96ba78 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh @@ -0,0 +1,14 @@ +source env_npu.sh + +export WORLD_SIZE=8 +for i in $(seq 0 7) +do + export RANK=$i + start=$((24 * i)) + end=$((start + 23)) + taskset -c $start-$end nohup python -u moby_main.py \ + --cfg configs/moby_swin_tiny.yaml \ + --data-path /data/imagenet \ + --local_rank $i \ + --batch-size 128 > train_${i}.log 2>&1 & +done \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/env_npu.sh 
b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000..c5f14bc83e --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/env_npu.sh @@ -0,0 +1,81 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 
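Editor's note: optimizer.py earlier in this patch routes 1-D parameters, *.bias parameters and explicitly skipped names into a zero-weight-decay group. A minimal sketch of that grouping on a toy module (the module and its sizes are illustrative assumptions):

    import torch.nn as nn

    def split_decay_groups(model, skip=(), skip_keywords=()):
        # mirrors set_weight_decay: 1-D params, *.bias and skipped names get no weight decay
        has_decay, no_decay = [], []
        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            if len(p.shape) == 1 or name.endswith(".bias") or name in skip \
                    or any(k in name for k in skip_keywords):
                no_decay.append(p)
            else:
                has_decay.append(p)
        return [{"params": has_decay}, {"params": no_decay, "weight_decay": 0.0}]

    toy = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
    groups = split_decay_groups(toy)
    print(len(groups[0]["params"]), len(groups[1]["params"]))  # 1 decayed weight, 3 undecayed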
+#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |awk '{print $1}') + +export SCALAR_TO_HOST_MEM=1 +export BMMV2_ENABLE=1 + +#设置device侧日志登记为error +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh new file mode 100644 index 0000000000..ef8490219f --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer-SSL_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi + if [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + echo "PATH TRAIN BEFORE: $PATH" + source set_conda.sh --conda_name=$conda_name + source activate $conda_name + echo "PATH TRAIN AFTER: $PATH" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/env_npu.sh +fi + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#设置环境变量,不需要修改 +RANK_ID=0 +ASCEND_DEVICE_ID=0 +echo "Device ID: $RANK_ID" +export RANK_ID=$RANK_ID +export ASCEND_DEVICE_ID=$RANK_ID +ASCEND_DEVICE_ID=$RANK_ID + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +export RANK_SIZE=8 + +python3.7 -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ +--cfg configs/moby_swin_tiny.yaml --data-path ${data_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#8p情况下仅0卡(主节点)有完整日志,因此后续日志提取仅涉及0卡 +ASCEND_DEVICE_ID=0 + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` 
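Editor's note: the next line converts the logged per-step time into throughput as batch_size / step_time * 8 for an 8-device run. A worked example with assumed numbers (not measured values from this patch):

    # Illustrative throughput arithmetic used by the test scripts (values are assumptions).
    batch_size = 128        # per-device batch size
    step_time = 0.5         # seconds per step, as parsed from the training log
    ranks = 8               # devices contributing images each step
    fps = batch_size / step_time * ranks
    print(f"{fps:.2f} images/sec")  # 2048.00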
+FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'*8}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Max accuracy' $cur_path/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|awk -F " " 'END {print $8}'|sed 's/%//g'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/eval_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/eval_${CaseName}_loss.txt|sed 's/.$//'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#退出anaconda环境 +if [ -n "$conda_name" ];then + echo "conda $conda_name deactivate" + conda deactivate +fi diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000..213bc33037 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer-SSL_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi + if [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + echo "PATH TRAIN BEFORE: $PATH" + source set_conda.sh --conda_name=$conda_name + source activate $conda_name + echo "PATH TRAIN AFTER: $PATH" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/env_npu.sh +fi + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#设置环境变量,不需要修改 +RANK_ID=0 +ASCEND_DEVICE_ID=0 +echo "Device ID: $RANK_ID" +export RANK_ID=$RANK_ID +export ASCEND_DEVICE_ID=$RANK_ID +ASCEND_DEVICE_ID=$RANK_ID + +#创建DeviceID输出目录,不需要修改 +if [ -d 
${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +export RANK_SIZE=8 + +export WORLD_SIZE=8 +KERNEL_NUM=$(($(nproc)/8)) +for((RANK_ID=0;RANK_ID ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -u moby_main.py \ + --cfg configs/moby_swin_tiny.yaml \ + --data-path $data_path \ + --local_rank $RANK_ID \ + --batch-size $batch_size > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + +wait + +#8p情况下仅0卡(主节点)有完整日志,因此后续日志提取仅涉及0卡 +ASCEND_DEVICE_ID=0 + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'*8}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Precision' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|head -1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt|sed 's/.$//'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#退出anaconda环境 +if [ -n "$conda_name" ];then + echo "conda $conda_name deactivate" + conda deactivate +fi diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000..856cc6eec2 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +#当前路径,不需要修改 
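Editor's note: train_full_8p.sh above launches one process per rank and, like run8p_pretrain.sh earlier in this patch, pins each rank to its own block of CPU cores via taskset. A hedged sketch of the core-range arithmetic (the per-rank core count is an assumption; the test scripts derive it from nproc at run time):

    # Illustrative per-rank CPU binding as used with `taskset -c start-end`.
    def core_range(rank, cores_per_rank=24):
        start = cores_per_rank * rank
        end = start + cores_per_rank - 1
        return f"{start}-{end}"

    for rank in range(8):
        print(f"rank {rank}: taskset -c {core_range(rank)}")
    # rank 0: taskset -c 0-23, rank 1: taskset -c 24-47, ...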
+cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer-SSL_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi + if [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + echo "PATH TRAIN BEFORE: $PATH" + source set_conda.sh --conda_name=$conda_name + source activate $conda_name + echo "PATH TRAIN AFTER: $PATH" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/env_npu.sh +fi + +#进入训练脚本目录,需要模型审视修改 +ASCEND_DEVICE_ID=0 + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3.7 -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 moby_main.py \ + --cfg configs/moby_swin_tiny.yaml \ + --data-path $data_path \ + --steps 1000 \ + --batch-size 128 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $13}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#退出anaconda环境 +if [ -n "$conda_name" ];then + echo "conda $conda_name deactivate" + conda deactivate +fi diff --git 
a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000..0e17d26031 --- /dev/null +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,152 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="Transformer-SSL_for_PyTorch" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi + if [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + echo "PATH TRAIN BEFORE: $PATH" + source set_conda.sh --conda_name=$conda_name + source activate $conda_name + echo "PATH TRAIN AFTER: $PATH" + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/env_npu.sh +fi + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#设置环境变量,不需要修改 +RANK_ID=0 +ASCEND_DEVICE_ID=0 +echo "Decive ID: $RANK_ID" +export RANK_ID=$RANK_ID +export ASCEND_DEVICE_ID=$RANK_ID +ASCEND_DEVICE_ID=$RANK_ID + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +fi + + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +export RANK_SIZE=8 + +KERNEL_NUM=$(($(nproc)/8)) + +export WORLD_SIZE=8 +for((RANK_ID=0;RANK_ID ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -u moby_main.py \ + --cfg configs/moby_swin_tiny.yaml \ + --data-path $data_path \ + --epochs 2 \ + --local_rank $RANK_ID \ + --batch-size $batch_size > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + +wait + +#8p情况下仅0卡(主节点)有完整日志,因此后续日志提取仅涉及0卡 +ASCEND_DEVICE_ID=0 + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'*8}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> 
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+# Deactivate the anaconda environment
+if [ -n "$conda_name" ];then
+    echo "conda $conda_name deactivate"
+    conda deactivate
+fi
diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/utils.py b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/utils.py
new file mode 100644
index 0000000000..dd68a4abb5
--- /dev/null
+++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/utils.py
@@ -0,0 +1,114 @@
+# --------------------------------------------------------
+# Swin Transformer
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ze Liu
+# Modified by Zhenda Xie
+# --------------------------------------------------------
+
+import os
+import torch
+import torch.distributed as dist
+
+try:
+    # noinspection PyUnresolvedReferences
+    from apex import amp
+except ImportError:
+    amp = None
+
+
+def load_pretrained(model, ckpt_path, logger):
+    model_dict = model.state_dict()
+
+    state_dict = torch.load(ckpt_path, map_location='cpu')['model']
+    state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if 'encoder.' in k}
+
+    for k in model_dict.keys():
+        if 'head' in k:
+            state_dict[k] = model_dict[k]
+
+    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+
+    logger.info(f'loaded pretrained checkpoint from: {ckpt_path}')
+    if len(missing_keys) > 0 or len(unexpected_keys) > 0:
+        logger.warning(f'Missing keys: {missing_keys}\nUnexpected keys: {unexpected_keys}')
+
+
+def load_checkpoint(config, model, optimizer, lr_scheduler, logger):
+    logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
+    if config.MODEL.RESUME.startswith('https'):
+        checkpoint = torch.hub.load_state_dict_from_url(
+            config.MODEL.RESUME, map_location='cpu', check_hash=True)
+    else:
+        checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
+    msg = model.load_state_dict(checkpoint['model'], strict=False)
+    logger.info(msg)
+    max_accuracy = 0.0
+    if not config.EVAL_MODE and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        config.defrost()
+        config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1
+        config.freeze()
+        if 'amp' in checkpoint and config.AMP_OPT_LEVEL != "O0" and checkpoint['config'].AMP_OPT_LEVEL != "O0":
+            amp.load_state_dict(checkpoint['amp'])
+        logger.info(f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})")
+        if 'max_accuracy' in checkpoint:
+            max_accuracy = checkpoint['max_accuracy']
+
+    del checkpoint
+    torch.npu.empty_cache()
+    return max_accuracy
+
+
+def save_checkpoint(config, epoch, model, max_accuracy, optimizer, lr_scheduler, logger):
+    save_state = {'model': model.state_dict(),
+                  'optimizer': optimizer.state_dict(),
+                  'lr_scheduler': lr_scheduler.state_dict(),
+                  'max_accuracy': max_accuracy,
+                  'epoch': epoch,
+                  'config': config}
+    if config.AMP_OPT_LEVEL != "O0":
+        save_state['amp'] = amp.state_dict()
+
+    save_path = os.path.join(config.OUTPUT, f'ckpt_epoch_{epoch}.pth')
+    logger.info(f"{save_path} saving......")
+    torch.save(save_state, save_path)
+    torch.save(save_state, os.path.join(config.OUTPUT, f'checkpoint.pth'))
+    logger.info(f"{save_path} saved !!!")
+
+
+def get_grad_norm(parameters, norm_type=2):
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    parameters = list(filter(lambda p: p.grad is not None, parameters))
+    norm_type = float(norm_type)
+    total_norm = 0
+    for p in parameters:
+        param_norm = p.grad.data.norm(norm_type)
+        total_norm += param_norm.item() ** norm_type
+    total_norm = total_norm ** (1. / norm_type)
+    return total_norm
+
+
+def auto_resume_helper(output_dir):
+    if os.path.exists(os.path.join(output_dir, 'checkpoint.pth')):
+        return os.path.join(output_dir, 'checkpoint.pth')
+
+    checkpoints = os.listdir(output_dir)
+    checkpoints = [ckpt for ckpt in checkpoints if ckpt.endswith('pth')]
+    print(f"All checkpoints found in {output_dir}: {checkpoints}")
+    if len(checkpoints) > 0:
+        latest_checkpoint = max([os.path.join(output_dir, d) for d in checkpoints], key=os.path.getmtime)
+        print(f"The latest checkpoint found: {latest_checkpoint}")
+        resume_file = latest_checkpoint
+    else:
+        resume_file = None
+    return resume_file
+
+
+def reduce_tensor(tensor):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+    rt /= dist.get_world_size()
+    return rt
-- 
Gitee