diff --git a/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfcd5066e715f7151f45e42be2843f6bf4e3563e --- /dev/null +++ b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py @@ -0,0 +1 @@ +import torch diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..517057158039f458ca1ac1341be58c1148b6b552 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2017, +All rights reserved. +Copyright 2022 Huawei Technologies Co., Ltd + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc4bc085fa14ef6c0f4434a0e703d94d241af8f7 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdcebb56a7f46785cded01bb7d2a3ff13e7cff89 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac3e653e5de063ae2cb252f772cbf1310ca05563 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cb1ae46dc90919c05212f3fa6b666291923fff8 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91c1504cf6d9d4e1146ca2351e73c6bd73723713 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ca77d8fa5294d321996f19a34eff2ff991fb3a3 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f72da710f540d1da76f6d5cbc5b7f9e47a67b83 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeffc9d3c26caf6a223af3a9cb046ba86c35bddd Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bbe6bd8e01b9e5f45e0bb6a1503403f9fa8121 
--- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Train and eval functions used in main.py +""" + +import os +import torch +import util.misc as utils +from datasets.coco_eval import CocoEvaluator +import onnxruntime + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + print(samples.tensors.shape) + + # onnx_input=torch.randn(1,3,750,800).numpy() + + # ort_inputs = {ort_session.get_inputs()[0].name:samples.tensors.cpu().numpy()} + # # print('inputs',ort_inputs) + # ort_outs = ort_session.run(None, ort_inputs) + # out={'pred_logits':torch.from_numpy(ort_outs[0]).cuda(), + # 'pred_boxes':torch.from_numpy(ort_outs[1]).cuda()} + # outputs=out + # loss_dict=criterion(out,targets) + + outputs = model(samples) + + loss_dict = criterion(outputs, targets) + + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + print(len(results[0]['scores']),results[0]['scores']) + print(results[0]['boxes']) + print(results[0]['labels']) + print(postprocessors.keys()) + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + coco_evaluator.update(res) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + + return stats, coco_evaluator \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..571b126ea4ed8db85bc75ff7947d674b8a5a2099 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import torch.utils.data +import torchvision + +from .coco import build as build_coco + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + # if isinstance(dataset, torchvision.datasets.CocoDetection): + # break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + + +def build_dataset(image_set, args): + if args.dataset_file == 'coco': + return build_coco(image_set, args) + if args.dataset_file == 'coco_panoptic': + # to avoid making panopticapi required for coco + from .coco_panoptic import build as build_coco_panoptic + return build_coco_panoptic(image_set, args) + raise ValueError(f'dataset {args.dataset_file} not supported') diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..816728a01044b95762478d70b371822e2ffcd19e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69983b6ba7fbe379137776a3c57350fb23ab5b16 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..689f41ab91d174dea6ef11587aa851ca4b5aec2e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd72017b858cf9e469a04406af63d77a8f454623 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a4bb3ac128cd0c9e7f43d79e205e0142980752 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6c4d22531816b7721e85752cdb07aa03d9f7468 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f011b0b88ed8100cdf362fe3682d349a156ea783 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0224c4baf0778ef1ab589c72a71b8707f21c6286 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..763db09f26c435b163b41b135013ab5df796043a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55f3cbecf4b2d8b549aa7a097884fd136156f530 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e94e8d5026dec45f152fd3d525e36f5af0cb6ad9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py @@ -0,0 +1,166 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO dataset which returns image_id for evaluation. 
+ +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms as T + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': target} + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # print(boxes) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if 
image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + # T.RandomResize(scales, max_size=1333), + T.pad_resize(), + T.Compose([ + # T.RandomResize([400, 500, 600]), + T.pad_resize(), + T.RandomSizeCrop(384, 600), + # T.RandomResize(scales, max_size=1333), + T.pad_resize(), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + + T.pad_resize(), + # T.RandomResize(sizes=(640,640),), + # T.RandomResize([800], max_size=1333), + + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.coco_path) + assert root.exists(), f'provided COCO path {root} does not exist' + mode = 'instances' + PATHS = { + "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), + "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), + } + + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) + return dataset diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..9487c08fd6b5da041facd4bd6c0b13c40a16df7d --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py @@ -0,0 +1,257 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO evaluator that works in distributed mode. + +Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py +The difference is that there is less copy-pasting from pycocotools +in the end of the file, as python3 can suppress prints with contextlib +""" +import os +import contextlib +import copy +import numpy as np +import torch + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from util.misc import all_gather + + +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + + # suppress pycocotools prints + with open(os.devnull, 'w') as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + 
if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + 'keypoints': keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = all_gather(img_ids) + all_eval_imgs = all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +################################################################# +# From pycocotools, just removed the prints and fixed +# a Python3 bug about unicode not defined +################################################################# + + +def evaluate(self): + ''' + Run per image evaluation on 
given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + # tic = time.time() + # print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..b24f615c2faa14b422829e2edad996e2b5b84248 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import json +from pathlib import Path + +import numpy as np +import torch +from PIL import Image + +from panopticapi.utils import rgb2id +from util.box_ops import masks_to_boxes + +from .coco import make_coco_transforms + + +class CocoPanoptic: + def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): + with open(ann_file, 'r') as f: + self.coco = json.load(f) + + # sort 'images' field so that they are aligned with 'annotations' + # i.e., in alphabetical order + self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) + # sanity check + if "annotations" in self.coco: + for img, ann in zip(self.coco['images'], self.coco['annotations']): + assert img['file_name'][:-4] == ann['file_name'][:-4] + + self.img_folder = img_folder + self.ann_folder = ann_folder + self.ann_file = ann_file + self.transforms = transforms + self.return_masks = return_masks + + def __getitem__(self, idx): + ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] + img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') + ann_path = Path(self.ann_folder) / ann_info['file_name'] + + img = Image.open(img_path).convert('RGB') + w, h = img.size + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb2id(masks) + + ids = np.array([ann['id'] for ann in ann_info['segments_info']]) + masks = masks == ids[:, None, None] + + masks = torch.as_tensor(masks, dtype=torch.uint8) + labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) + + target = {} + target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) + if self.return_masks: + target['masks'] = masks + target['labels'] = labels + + target["boxes"] = masks_to_boxes(masks) + + target['size'] = torch.as_tensor([int(h), int(w)]) + target['orig_size'] = torch.as_tensor([int(h), int(w)]) + if "segments_info" in ann_info: + for name in ['iscrowd', 'area']: + target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.coco['images']) + + def get_height_and_width(self, idx): + img_info = self.coco['images'][idx] + height = img_info['height'] + width = img_info['width'] + return height, width + + +def build(image_set, args): + img_folder_root = Path(args.coco_path) + ann_folder_root = Path(args.coco_panoptic_path) + assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' + assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' + mode = 'panoptic' + PATHS = { + "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), + "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), + } + + img_folder, ann_file = PATHS[image_set] + img_folder_path = img_folder_root / img_folder + ann_folder = ann_folder_root / f'{mode}_{img_folder}' + ann_file = ann_folder_root / ann_file + + dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file, + transforms=make_coco_transforms(image_set), return_masks=args.masks) + + return dataset diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py new file mode 100644 index 
0000000000000000000000000000000000000000..9cb4f83409046a5c2a87643ee005e52a440aae74 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import json +import os + +import util.misc as utils + +try: + from panopticapi.evaluation import pq_compute +except ImportError: + pass + + +class PanopticEvaluator(object): + def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): + self.gt_json = ann_file + self.gt_folder = ann_folder + if utils.is_main_process(): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + self.output_dir = output_dir + self.predictions = [] + + def update(self, predictions): + for p in predictions: + with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: + f.write(p.pop("png_string")) + + self.predictions += predictions + + def synchronize_between_processes(self): + all_predictions = utils.all_gather(self.predictions) + merged_predictions = [] + for p in all_predictions: + merged_predictions += p + self.predictions = merged_predictions + + def summarize(self): + if utils.is_main_process(): + json_data = {"annotations": self.predictions} + predictions_json = os.path.join(self.output_dir, "predictions.json") + with open(predictions_json, "w") as f: + f.write(json.dumps(json_data)) + return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) + return None diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..0419a812365f5b2878c1f19daffcedf7d89558a5 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py @@ -0,0 +1,330 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from PIL import Image, ImageDraw +from util.box_ops import box_xyxy_to_cxcywh +from util.misc import interpolate +import numpy as np + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
+ target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target['boxes'].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target['masks'].flatten(1).any(1) + + for field in fields: + target[field] = target[field][keep] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + # for i in target['boxes']: + # draw=ImageDraw.Draw(image) + # draw.line([(i[0].item(), i[1].item()),(i[2].item(),i[1].item()), + # (i[2].item(), i[3].item()),(i[0].item(),i[3].item()), + # (i[0].item(), i[1].item())], width=2, fill='red') + # image.show() + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + # rescaled_image = F.resize(image, size=(1280, 720)) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes + torch.as_tensor([200, 200, 200, 200]) + scaled_boxes = scaled_boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + # print('pad:',target['boxes']) + # for i in target['boxes']: + # draw=ImageDraw.Draw(rescaled_image) + # draw.line([(i[0].item(), i[1].item()),(i[2].item(),i[1].item()), + # (i[2].item(), i[3].item()),(i[0].item(),i[3].item()), + # (i[0].item(), i[1].item())], width=2, fill='red') + # rescaled_image.show() + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target 
= target.copy() + # should we do something wrt the original size? + target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int): + self.min_size = min_size + self.max_size = max_size + + def __call__(self, img: PIL.Image.Image, target: dict): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + return crop(img, target, region) + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class pad_resize(object): + def __init__(self, sizes=None): + # assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + + def __call__(self, img, target=None): + + img, target = Pad_img(img, target) + return resize(img, target, size=(1280, 1280)) + + +def Pad_img(image, target): + # assumes that we only pad on the bottom right corners + + h, w = image.size + pad_value = int(abs(h - w) / 2) + + if h > w: + padded_image = F.pad(image, (0, pad_value, 0, pad_value), fill=0) + else: + padded_image = F.pad(image, (pad_value, 0, pad_value, 0), fill=0) + h_, w_ = padded_image.size + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes + torch.as_tensor([abs(h - h_) / 2, abs(w - w_) / 2, abs(h - h_) / 2, abs(w - w_) / 2]) + target["boxes"] = scaled_boxes + + if target is None: + return padded_image, None + + target["size"] = torch.tensor([h_, w_]) + + return padded_image, target + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + + def 
__init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a78c8342468ebee4116a72630ecc5b2147679a --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Train and eval functions used in main.py +""" +import math +import os +import sys +from typing import Iterable + +import torch +from apex import amp +import util.misc as utils +from datasets.coco_eval import CocoEvaluator +from datasets.panoptic_eval import PanopticEvaluator +import time + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, batch_size: int, epoch: int, max_norm: float = 0): + + model.train() + criterion.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 1 + + for samples, targets in metric_logger.log_every(data_loader, batch_size, print_freq, header): + optimizer.zero_grad() + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + outputs = model(samples) + + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) + + loss_value = losses_reduced_scaled.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + print(loss_dict_reduced) + sys.exit(1) + + with amp.scale_loss(losses, optimizer) as scaled_loss: + scaled_loss.backward() + # losses.backward() + if max_norm > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + optimizer.step() + metric_logger.update(loss=loss_value, 
**loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + panoptic_evaluator = None + if 'panoptic' in postprocessors.keys(): + panoptic_evaluator = PanopticEvaluator( + data_loader.dataset.ann_file, + data_loader.dataset.ann_folder, + output_dir=os.path.join(output_dir, "panoptic_eval"), + ) + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + if 'segm' in postprocessors.keys(): + target_sizes = torch.stack([t["size"] for t in targets], dim=0) + results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + if panoptic_evaluator is not None: + res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) + for i, target in enumerate(targets): + image_id = target["image_id"].item() + file_name = f"{image_id:012d}.png" + res_pano[i]["image_id"] = image_id + res_pano[i]["file_name"] = file_name + + panoptic_evaluator.update(res_pano) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + if panoptic_evaluator is not None: + panoptic_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + panoptic_res = None + if panoptic_evaluator is not None: + panoptic_res = panoptic_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in 
postprocessors.keys(): + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in postprocessors.keys(): + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + if panoptic_res is not None: + stats['PQ_all'] = panoptic_res["All"] + stats['PQ_th'] = panoptic_res["Things"] + stats['PQ_st'] = panoptic_res["Stuff"] + return stats, coco_evaluator diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..328c3306d03dc65b71898ead44b2b3f8164de4a0 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +from models.backbone import Backbone, Joiner +from models.detr import DETR, PostProcess +from models.position_encoding import PositionEmbeddingSine +from models.segmentation import DETRsegm, PostProcessPanoptic +from models.transformer import Transformer + +dependencies = ["torch", "torchvision"] + + +def _make_detr(backbone_name: str, dilation=False, num_classes=91, mask=False): + hidden_dim = 256 + backbone = Backbone(backbone_name, train_backbone=True, return_interm_layers=mask, dilation=dilation) + pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True) + backbone_with_pos_enc = Joiner(backbone, pos_enc) + backbone_with_pos_enc.num_channels = backbone.num_channels + transformer = Transformer(d_model=hidden_dim, return_intermediate_dec=True) + detr = DETR(backbone_with_pos_enc, transformer, num_classes=num_classes, num_queries=100) + if mask: + return DETRsegm(detr) + return detr + + +def detr_resnet50(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR R50 with 6 encoder and 6 decoder layers. + + Achieves 42/62.4 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet50", dilation=False, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet50_dc5(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R50 with 6 encoder and 6 decoder layers. + + The last block of ResNet-50 has dilation to increase + output resolution. + Achieves 43.3/63.1 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet50", dilation=True, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-f0fb7ef5.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet101(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + Achieves 43.5/63.8 AP/AP50 on COCO val5k. 
+ """ + model = _make_detr("resnet101", dilation=False, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet101_dc5(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + The last block of ResNet-101 has dilation to increase + output resolution. + Achieves 44.9/64.7 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet101", dilation=True, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet50_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR R50 with 6 encoder and 6 decoder layers. + Achieves 43.4 PQ on COCO val5k. + + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet50", dilation=False, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-panoptic-00ce5173.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model + + +def detr_resnet50_dc5_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR-DC5 R50 with 6 encoder and 6 decoder layers. + + The last block of ResNet-50 has dilation to increase + output resolution. + Achieves 44.6 on COCO val5k. + + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet50", dilation=True, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-panoptic-da08f1b1.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model + + +def detr_resnet101_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + Achieves 45.1 PQ on COCO val5k. 
+ + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet101", dilation=False, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-panoptic-40021d53.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a3f26531befaf6abb215e48a0ef4bfc3da1c7c04 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .detr import build + + +def build_model(args): + return build(args) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..412367e3f4bbf044d602c58174d344e91abcd382 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a9664cfcc4178ecfaef6baa75383eab6cf2f20a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de63f498e3348a44805c347917b7e681251135f8 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d72cf5d1aa2f98e62ec7462b03e9f0bf9f53edc3 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce70cc4869f118856944389e0a555174d5bf678e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4739f63cb5dd95eccfaf35b4aadc85493664cc9f Binary files /dev/null and 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0adfb0bcbde625e011cf77144d73aa22397f8604 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f51dca35d0b794935a0b0a263c79cc040c87a4a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4b076f3c107d8a1804b6192ae965dc596ceb988 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa0717efd318b39a44233205b2c25f204707e3a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64ceb75c03bb4c1cfaca8528f33cfbadbe2d833e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..078aea0bdc5837558360986b882ca5ef5a427baa Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1141c355a432b454a34b445858d828194293c4bb Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e00b3332583fa7bb3f462d40de05091169ef664 Binary files /dev/null and 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..9976e2e3586b6b32f878e0c8ecbdabc4dea18938 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py @@ -0,0 +1,118 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Backbone modules. +""" +from collections import OrderedDict + +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter +from typing import Dict, List + +from util.misc import NestedTensor, is_main_process + +from .position_encoding import build_position_encoding + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class BackboneBase(nn.Module): + + def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): + super().__init__() + for name, parameter in backbone.named_parameters(): + if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: + parameter.requires_grad_(False) + if return_interm_layers: + return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} + else: + return_layers = {'layer4': "0"} + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.num_channels = num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.body(tensor_list.tensors) + out: Dict[str, NestedTensor] = {} + for name, x in xs.items(): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + out[name] = NestedTensor(x, mask) + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + def __init__(self, name: str, + train_backbone: bool, + return_interm_layers: bool, + dilation: bool): + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) + 
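Editor's note (not part of the diff): FrozenBatchNorm2d above folds the running statistics into a fixed affine transform. A minimal standalone check, assuming only stock torch/torch.nn, that this matches an eval-mode nn.BatchNorm2d with the same buffers (both use eps = 1e-5):

import torch
import torch.nn as nn

# Eval-mode BatchNorm2d reduces to x * scale + bias with
# scale = w / sqrt(running_var + eps) and bias = b - running_mean * scale,
# which is exactly the affine form FrozenBatchNorm2d precomputes.
bn = nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)

x = torch.randn(2, 8, 4, 4)
w = bn.weight.reshape(1, -1, 1, 1)
b = bn.bias.reshape(1, -1, 1, 1)
rm = bn.running_mean.reshape(1, -1, 1, 1)
rv = bn.running_var.reshape(1, -1, 1, 1)
scale = w * (rv + bn.eps).rsqrt()
frozen = x * scale + (b - rm * scale)

print(torch.allclose(bn(x), frozen, atol=1e-5))  # True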
num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 + super().__init__(backbone, train_backbone, num_channels, return_interm_layers) + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, tensor_list: NestedTensor): + xs = self[0](tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self[1](x).to(x.tensors.dtype)) + return out, pos + + +def build_backbone(args): + position_embedding = build_position_encoding(args) + train_backbone = args.lr_backbone > 0 + return_interm_layers = args.masks + backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) + model = Joiner(backbone, position_embedding) + model.num_channels = backbone.num_channels + return model diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py new file mode 100644 index 0000000000000000000000000000000000000000..44209c82421e3dbad7c59514a7c4b21030a46d32 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py @@ -0,0 +1,368 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR model and criterion classes. +""" +import torch +import torch.nn.functional as F +from torch import nn + +from util import box_ops +from util.misc import (NestedTensor, nested_tensor_from_tensor_list,_onnx_nested_tensor_from_tensor_list, + accuracy, get_world_size, interpolate, + is_dist_avail_and_initialized) + +from .backbone import build_backbone +from .matcher import build_matcher +from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm, + dice_loss, sigmoid_focal_loss) +from .transformer import build_transformer + + +class DETR(nn.Module): + """ This is the DETR module that performs object detection """ + def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.query_embed = nn.Embedding(num_queries, hidden_dim) + self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) + self.backbone = backbone + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor): + """ The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x (num_classes + 1)] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). 
These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. + """ + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.backbone(samples) + src, mask = features[-1].decompose() + assert mask is not None + hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + outputs_class = self.class_embed(hs) + outputs_coord = self.bbox_embed(hs).sigmoid() + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
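Editor's note (not part of the diff): for reference, a small illustrative sketch of the data contract implied by the docstrings above -- model outputs as a dict of batched tensors, targets as one dict per image. All values are made up.

import torch

batch_size, num_queries, num_classes = 2, 100, 91
outputs = {
    "pred_logits": torch.randn(batch_size, num_queries, num_classes + 1),  # +1 for no-object
    "pred_boxes": torch.rand(batch_size, num_queries, 4),                  # (cx, cy, w, h) in [0, 1]
}
targets = [
    {"labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)},  # image 0: two objects
    {"labels": torch.tensor([58]),    "boxes": torch.rand(1, 4)},  # image 1: one object
]
# criterion(outputs, targets) then returns a dict of scalar losses such as
# {"loss_ce": ..., "loss_bbox": ..., "loss_giou": ..., "cardinality_error": ..., "class_error": ...}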
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J].to('cpu') for t, (_, J) in zip(targets, indices)]).to('npu') + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i].to('cpu') for t, (_, i) in zip(targets, indices)], dim=0).to('npu') + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + sc = torch.ones_like(scale_fct) + value = torch.max(scale_fct.half(), dim=1)[0] + value = torch.unsqueeze(value, dim=0).t() + scale_fct_value = sc * value + sc_value = (scale_fct_value - scale_fct) / 2 + sc_ex = torch.unsqueeze(sc_value, dim=1) + boxes = boxes * scale_fct_value[:, None, :] + boxes_one = torch.ones_like(boxes) + boxex_ten = boxes_one * sc_ex + boxes = boxes - boxex_ten + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + + return results + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def build(args): + # the `num_classes` naming here is somewhat misleading. + # it indeed corresponds to `max_obj_id + 1`, where max_obj_id + # is the maximum id for a class in your dataset. For example, + # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91. + # As another example, for a dataset that has a single class with id 1, + # you should pass `num_classes` to be 2 (max_obj_id + 1). 
+ # For more details on this, check the following discussion + # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 + num_classes = 20 if args.dataset_file != 'coco' else 91 + if args.dataset_file == "coco_panoptic": + # for panoptic, we just add a num_classes that is large enough to hold + # max_obj_id + 1, but the exact value doesn't really matter + num_classes = 250 + device = torch.device(args.device) + + backbone = build_backbone(args) + + transformer = build_transformer(args) + + model = DETR( + backbone, + transformer, + num_classes=num_classes, + num_queries=args.num_queries, + aux_loss=args.aux_loss, + ) + if args.masks: + model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None)) + matcher = build_matcher(args) + weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: + weight_dict["loss_mask"] = args.mask_loss_coef + weight_dict["loss_dice"] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes', 'cardinality'] + if args.masks: + losses += ["masks"] + print('losses',losses) + + criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, + eos_coef=args.eos_coef, losses=losses) + criterion.to(device) + postprocessors = {'bbox': PostProcess()} + if args.masks: + postprocessors['segm'] = PostProcessSegm() + if args.dataset_file == "coco_panoptic": + is_thing_map = {i: i <= 90 for i in range(201)} + postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85) + + return model, criterion, postprocessors diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4483e4c49d2d679bb48eef87a642695be05ba3 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Modules to compute the matching cost and solve the corresponding LSAP. +""" +import torch +from scipy.optimize import linear_sum_assignment +from torch import nn + +from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou + + +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). 
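Editor's note (not part of the diff): the matcher below delegates the actual assignment to SciPy. A toy example with made-up costs, showing how linear_sum_assignment resolves a 4-query / 2-target cost matrix for one image:

import torch
from scipy.optimize import linear_sum_assignment

# Rows = 4 predictions, columns = 2 ground-truth objects; lower cost is better.
cost = torch.tensor([[0.9, 0.2],
                     [0.1, 0.8],
                     [0.7, 0.7],
                     [0.6, 0.3]])
row_ind, col_ind = linear_sum_assignment(cost.numpy())
# Every target gets exactly one prediction; unmatched predictions are supervised as "no object".
print(list(zip(row_ind.tolist(), col_ind.tolist())))  # [(0, 1), (1, 0)]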
+ """ + + def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" + + @torch.no_grad() + def forward(self, outputs, targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + + targets_labels = [v["labels"].to('cpu') for v in targets] + targets_boxes = [v["boxes"].to('cpu') for v in targets] + + tgt_ids = torch.cat(targets_labels).to('npu') + tgt_bbox = torch.cat(targets_boxes).to('npu') + + # tgt_ids = torch.cat([v["labels"] for v in targets]) + # tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. 
+ cost_class = -out_prob[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +def build_matcher(args): + return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..6afc9190d485dc8b94d623fa398d37aba8a47b9f --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py @@ -0,0 +1,91 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Various positional encodings for the transformer. +""" +import math +import torch +from torch import nn + +from util.misc import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + # not_mask = ~mask + not_mask = (~mask).float() + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. 
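Editor's note (not part of the diff): a quick illustrative shape check for the sine position embedding defined above (mask and normalization omitted, dummy sizes). With hidden_dim = 256, each spatial axis receives num_pos_feats = 128 channels and the result matches the backbone feature map layout.

import torch

num_pos_feats, temperature = 128, 10000
b, h, w = 2, 25, 38
not_mask = torch.ones(b, h, w)
y_embed = not_mask.cumsum(1)
x_embed = not_mask.cumsum(2)

dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)

pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
print(pos.shape)  # torch.Size([2, 256, 25, 38]) -- [batch, hidden_dim, H, W]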
+ """ + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + + pos = torch.cat([ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ('v2', 'sine'): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSine(N_steps, normalize=True) + elif args.position_embedding in ('v3', 'learned'): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..01faa8851838661a930440b5f6ccf68ca2e6fb8d --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py @@ -0,0 +1,363 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +This file provides the definition of the convolutional heads used to predict masks, as well as the losses +""" +import io +from collections import defaultdict +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from PIL import Image + +import util.box_ops as box_ops +from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list + +try: + from panopticapi.utils import id2rgb, rgb2id +except ImportError: + pass + + +class DETRsegm(nn.Module): + def __init__(self, detr, freeze_detr=False): + super().__init__() + self.detr = detr + + if freeze_detr: + for p in self.parameters(): + p.requires_grad_(False) + + hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead + self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0) + self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) + + def forward(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.detr.backbone(samples) + + bs = features[-1].tensors.shape[0] + + src, mask = features[-1].decompose() + assert mask is not None + src_proj = self.detr.input_proj(src) + hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) + + outputs_class = self.detr.class_embed(hs) + outputs_coord = self.detr.bbox_embed(hs).sigmoid() + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} + if self.detr.aux_loss: + out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord) + + # FIXME h_boxes takes the last one computed, keep this in mind + bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) + + seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) + 
outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + out["pred_masks"] = outputs_seg_masks + return out + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +class MaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. + Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, dim) + self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + +class MHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + nn.init.zeros_(self.k_linear.bias) + nn.init.zeros_(self.q_linear.bias) + nn.init.xavier_uniform_(self.k_linear.weight) + nn.init.xavier_uniform_(self.q_linear.weight) + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + qh = 
q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +class PostProcessSegm(nn.Module): + def __init__(self, threshold=0.5): + super().__init__() + self.threshold = threshold + + @torch.no_grad() + def forward(self, results, outputs, orig_target_sizes, max_target_sizes): + assert len(orig_target_sizes) == len(max_target_sizes) + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs["pred_masks"].squeeze(2) + outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = F.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + +class PostProcessPanoptic(nn.Module): + """This class converts the output of the model to the final panoptic result, in the format expected by the + coco panoptic API """ + + def __init__(self, is_thing_map, threshold=0.85): + """ + Parameters: + is_thing_map: This is a whose keys are the class ids, and the values a boolean indicating whether + the class is a thing (True) or a stuff (False) class + threshold: confidence threshold: segments with confidence lower than this will be deleted + """ + 
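Editor's note (not part of the diff): a toy sanity check, illustrative only, for the dice_loss and sigmoid_focal_loss defined above. The helpers below restate the same formulas for a single flattened mask; a confident correct prediction should score far lower than a confident wrong one.

import torch
import torch.nn.functional as F

def dice(logits, targets):
    # Same formula as dice_loss above, specialized to one already-flattened mask.
    probs = logits.sigmoid()
    numerator = 2 * (probs * targets).sum(1)
    denominator = probs.sum(1) + targets.sum(1)
    return (1 - (numerator + 1) / (denominator + 1)).mean()

def focal(logits, targets, alpha=0.25, gamma=2):
    prob = logits.sigmoid()
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce * (1 - p_t) ** gamma
    return ((alpha * targets + (1 - alpha) * (1 - targets)) * loss).mean()

targets = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
good = torch.tensor([[8.0, 8.0, -8.0, -8.0]])  # confident and correct
bad = -good                                    # confident and wrong
print(dice(good, targets) < dice(bad, targets))    # tensor(True)
print(focal(good, targets) < focal(bad, targets))  # tensor(True)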
super().__init__() + self.threshold = threshold + self.is_thing_map = is_thing_map + + def forward(self, outputs, processed_sizes, target_sizes=None): + """ This function computes the panoptic prediction from the model's predictions. + Parameters: + outputs: This is a dict coming directly from the model. See the model doc for the content. + processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the + model, ie the size after data augmentation but before batching. + target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size + of each prediction. If left to None, it will default to the processed_sizes + """ + if target_sizes is None: + target_sizes = processed_sizes + assert len(processed_sizes) == len(target_sizes) + out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] + assert len(out_logits) == len(raw_masks) == len(target_sizes) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + assert len(cur_boxes) == len(cur_classes) + + # It may be that we have several predicted masks for the same stuff class. 
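Editor's note (not part of the diff): the panoptic merge performed below assigns every pixel to the query whose soft mask response is strongest at that pixel. A tiny illustration with made-up numbers of that argmax step:

import torch

h, w = 3, 4
cur_masks = torch.randn(5, h * w)  # 5 kept queries, masks already flattened as in the code below
m_id = cur_masks.transpose(0, 1).softmax(-1).argmax(-1).view(h, w)
print(m_id)                        # each entry in [0, 5): the winning query id per pixel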
+ # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not self.is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = ( + torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() + ) + m_id = torch.from_numpy(rgb2id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd536750acbfea7e4d514acb6e60154dc28ddbd --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py @@ -0,0 +1,297 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import Optional, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=False): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + mask = mask.flatten(1) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: 
+ intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, 
query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + ) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb8f7823ba7a47d8edcf966f7652a8bf4ddb86ea --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt @@ -0,0 +1,9 @@ +cython +git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools +submitit +torch>=1.5.0 +torchvision>=0.6.0 +git+https://github.com/cocodataset/panopticapi.git#egg=panopticapi +scipy +onnx +onnxruntime diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py new file mode 100644 index 
0000000000000000000000000000000000000000..b6780def01e2f3266b24889403f11d95fffddafe --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py @@ -0,0 +1,111 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +A script to run multinode training with submitit. +""" +import argparse +import os +import uuid +from pathlib import Path + +import main as detection +import submitit + + +def parse_args(): + detection_parser = detection.get_args_parser() + parser = argparse.ArgumentParser("Submitit for detection", parents=[detection_parser]) + parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") + parser.add_argument("--nodes", default=4, type=int, help="Number of nodes to request") + parser.add_argument("--timeout", default=60, type=int, help="Duration of the job") + parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") + return parser.parse_args() + + +def get_shared_folder() -> Path: + user = os.getenv("USER") + if Path("/checkpoint/").is_dir(): + p = Path(f"/checkpoint/{user}/experiments") + p.mkdir(exist_ok=True) + return p + raise RuntimeError("No shared folder available") + + +def get_init_file(): + # Init file must not exist, but it's parent dir must exist. + os.makedirs(str(get_shared_folder()), exist_ok=True) + init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import main as detection + + self._setup_gpu_args() + detection.main(self.args) + + def checkpoint(self): + import os + import submitit + from pathlib import Path + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / "%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) + + # cluster setup is defined by environment variables + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + executor.update_parameters( + mem_gb=40 * num_gpus_per_node, + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=10, + nodes=nodes, + timeout_min=timeout_min, # max is 60 * 72 + ) + + executor.update_parameters(name="detr") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + print("Submitted job_id:", job.job_id) + + +if __name__ == "__main__": + main() diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..280fca96da61c2d983f9a690fd0c044bbcba9167 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Host侧Event日志开启标志,0-关闭/1-开启 +export 
ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +#设置Device侧日志等级为error +${install_path}/driver/tools/msnpureport -g error +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable +export BMMV2_ENABLE=1 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/gcc7.3.0/lib64:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5c6678e2234a3ed7df73137b2b02fa6097eeaa4 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=0.0001 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
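For context, train_full_8p.sh (like the performance scripts added later in this patch) treats --data_path as mandatory, accepts optional overrides such as --workers, sources test/env_npu.sh on its own when no platform flag is set, and changes into the model root automatically if launched from inside test/. A minimal launch sketch, with a placeholder dataset path that is purely illustrative:

    # only the COCO dataset path is required; --workers is optional
    bash test/train_full_8p.sh --data_path=/path/to/coco --workers=64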
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + + +device_id_list=0,1,2,3,4,5,6,7 +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u train_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing_distributed \ + --dist_url='tcp://127.0.0.1:50000' \ + --dist_backend='hccl' \ + --epochs=${train_epochs} \ + --lr=${learning_rate} \ + --world_size=1 \ + --batch_size=${batch_size} \ + --device_num=8 \ + --rank=0 \ + --device_list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..6e9b33650979cfc0c46f96e4afd8899f5a66578b --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + +nohup taskset -c 0-23 python3.7 -u train_npu.py \ + --coco_path=${data_path} \ + --workers=${workers} \ + --gpu=${ASCEND_DEVICE_ID} \ + --epochs=${train_epochs} \ + --opt_level='O0' \ + --batch_size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -a loss 
${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "\t" '{print $2}'|awk -F ":" '{print $2}'|awk 'END {print}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log + + diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..c4182b7a1f04ec6a40aa4020df6270bb6172af21 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 学习率 +learning_rate=0.0001 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
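A note on the metrics reported by these test scripts: TrainingTime is derived from the measured throughput as batch_size * 1000 / FPS, i.e. the average duration of one training iteration in milliseconds. A worked example with illustrative numbers only:

    # with batch_size=8 and a measured FPS of 40 images/sec:
    awk 'BEGIN{printf "%.2f\n", 8*1000/40}'   # -> 200.00 (ms per iteration)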
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + + +device_id_list=0,1,2,3,4,5,6,7 +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u train_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing_distributed \ + --dist_url='tcp://127.0.0.1:50000' \ + --dist_backend='hccl' \ + --epochs=${train_epochs} \ + --lr=${learning_rate} \ + --world_size=1 \ + --batch_size=${batch_size} \ + --device_num=8 \ + --rank=0 \ + --device_list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..6f7358263440c5a5c417ba68533ae5953d26554b --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py @@ -0,0 +1,375 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import argparse +import datetime +import json +import random +import time +from pathlib import Path + +import numpy as np +import torch +from torch.utils.data import DataLoader, DistributedSampler + +import datasets +import util.misc as utils +from datasets import build_dataset, get_coco_api_from_dataset +from engine import evaluate, train_one_epoch +from models import build_model + +import apex +from apex import amp +from apex.parallel import convert_syncbn_model +from apex.parallel import DistributedDataParallel +import torch.distributed as dist +import os +import warnings + +def get_args_parser(): + parser = argparse.ArgumentParser('Set transformer detector', add_help=False) + parser.add_argument('--lr', default=1e-4, type=float) + parser.add_argument('--lr_backbone', default=1e-5, type=float) + parser.add_argument('--batch_size', default=8, type=int) + parser.add_argument('--weight_decay', default=1e-4, type=float) + parser.add_argument('--epochs', default=400, type=int) + parser.add_argument('--lr_drop', default=200, type=int) + parser.add_argument('--clip_max_norm', default=0.1, type=float, + help='gradient clipping max norm') + + # Model parameters + parser.add_argument('--frozen_weights', type=str, default=None, + help="Path to the pretrained model. If set, only the mask head will be trained") + # * Backbone + parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") + parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") + parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + # * Transformer + parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") + parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") + parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") + parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") + parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") + parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") + parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") + parser.add_argument('--pre_norm', action='store_true') + + # * Segmentation + parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") + + # Loss + parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', + help="Disables auxiliary decoding losses (loss at each layer)") + # * Matcher + parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") + parser.add_argument('--set_cost_bbox', default=5, 
type=float, + help="L1 box coefficient in the matching cost") + parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") + # * Loss coefficients + parser.add_argument('--mask_loss_coef', default=1, type=float) + parser.add_argument('--dice_loss_coef', default=1, type=float) + parser.add_argument('--bbox_loss_coef', default=5, type=float) + parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + + # dataset parameters + parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--coco_path', type=str, default='/opt/npu/dataset/coco') + parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--remove_difficult', action='store_true') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='npu', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true') + parser.add_argument('--num_workers', default=8, type=int) + + # edit this for 8p + parser.add_argument('--dist-backend', type=str, default='hccl') + parser.add_argument('--distributed', type=bool, default=True) + parser.add_argument('--world-size', type=int, default=-1) + parser.add_argument('--rank', type=int, default=-1) + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--addr', type=str, default='127.0.0.1') + parser.add_argument('--device_num', type=int, default=-1) + parser.add_argument('--workers', type=int, default=32) + parser.add_argument('--device-list', default='', type=str) + parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000') + parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') + parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + warnings.filterwarnings('ignore') + #############end################# + return parser + +def main(args): + torch.manual_seed(args.seed) + ############################## + # edit this for 8p + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29888' + os.environ['LOCAL_DEVICE_ID'] = str(0) + print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + print('==========>args.world_size: ', args.world_size) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if args.device_list != '': + ngpus_per_node = len(args.device_list.split(',')) + elif args.device_num != -1: + ngpus_per_node = args.device_num + elif args.device == 'npu': + ngpus_per_node = int(os.environ["RANK_SIZE"]) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + # The child process uses the environment variables of the parent process, + # we have to set LOCAL_DEVICE_ID for every proc + if args.device == 'npu': + main_worker(args.local_rank, ngpus_per_node, args) + else: + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + ############################## + +def main_worker(gpu, ngpus_per_node, args): + + if args.frozen_weights is not None: + assert args.masks, "Frozen training is meant for segmentation only" + #####################begin############################## + if args.device_list != '': + print(args.device_list) + args.gpu = int(args.device_list.split(',')[gpu]) + else: + args.gpu = gpu + + print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + os.environ['LOCAL_DEVICE_ID'] = str(args.gpu) + print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + + if args.gpu is not None: + print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + print("[npu id:", args.gpu, "]", args) + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + ##################end################ + + # device = torch.device(args.device) + + # fix the seed for reproducibility + seed = 
args.seed + utils.get_rank() + np.random.seed(seed) + random.seed(seed) + + model, criterion, postprocessors = build_model(args) + # model.to(device) + # model = convert_syncbn_model(model) + model = model.to(loc) + model_without_ddp = model + param_dicts = [ + {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, + { + "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], + "lr": args.lr_backbone, + }, + ] + + # utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + + optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, + weight_decay=args.weight_decay) + + # optimizer = apex.optimizers.NpuFusedAdamW(param_dicts, lr=args.lr, + # weight_decay=args.weight_decay) + + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) + + # model.to(device) + opt_level = 'O0' + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) + for ls in amp._amp_state.loss_scalers: + ls._scale_seq_len = 50 + ls._loss_scale = 2. ** 24 + + + if args.distributed: + # model = DistributedDataParallel(model) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + model_without_ddp = model.module + # n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + n_parameters = sum(p.numel() for p in model.parameters()) + print('number of params:', n_parameters) + + dataset_train = build_dataset(image_set='train', args=args) + dataset_val = build_dataset(image_set='val', args=args) + + if args.distributed: + sampler_train = DistributedSampler(dataset_train) + sampler_val = DistributedSampler(dataset_val, shuffle=False) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + batch_sampler_train = torch.utils.data.BatchSampler( + sampler_train, args.batch_size, drop_last=True) + + data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, + collate_fn=utils.collate_fn, num_workers=args.num_workers) + data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + + if args.dataset_file == "coco_panoptic": + # We also evaluate AP during panoptic training, on original coco DS + coco_val = datasets.coco.build("val", args) + base_ds = get_coco_api_from_dataset(coco_val) + else: + base_ds = get_coco_api_from_dataset(dataset_val) + + if args.frozen_weights is not None: + checkpoint = torch.load(args.frozen_weights, map_location='cpu') + model_without_ddp.detr.load_state_dict(checkpoint['model']) + + output_dir = Path(args.output_dir) + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + if args.eval: + test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, + data_loader_val, base_ds, loc, args.output_dir) + if args.output_dir: + utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, 
output_dir / "eval.pth") + return + + print("Start training") + start_time = time.time() + best = 0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + sampler_train.set_epoch(epoch) + + train_stats = train_one_epoch( + model, criterion, data_loader_train, optimizer, loc, epoch, + args.clip_max_norm) + lr_scheduler.step() + + if args.output_dir: + checkpoint_paths = [output_dir / 'checkpoint.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: + checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args, + }, checkpoint_path) + + test_stats, coco_evaluator = evaluate( + model, criterion, postprocessors, data_loader_val, base_ds, loc, args.output_dir + ) + + map = coco_evaluator.coco_eval['bbox'].stats[0] + if map >= best: + print(map) + best = map + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args, + }, 'output/checkpoint_{}.pth'.format(map)) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ce80ead7a8df0dfa09aa3e9fcd147d99516d47a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75ce5c70ceb5cb340aed14946d0ec6cec79cec06 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38266ba3db12c3b64edef438c8b1c9e8d99964e5 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af354e4cc519a03b4350ce2dab1c2d0eb96df0b2 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbd42f01283591f7d74d3a07653b1a6c2acf7f8c Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b3b4b214ac31f0fde1248254220e74a874f5f27 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..9c088e5bacc88ff7217fc971f5db889f5bb45b39 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Utilities for bounding box manipulation and GIoU. 
+""" +import torch +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + The boxes should be in [x0, y0, x1, y1] format + + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..e08159ebe1860387ac74dc913004454e9d26cbf9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py @@ -0,0 +1,489 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Misc functions, including distributed helpers. +Mostly copy-paste from torchvision references. +""" +import os +import subprocess +import time +from collections import defaultdict, deque +import datetime +import pickle +from typing import Optional, List + +import torch +import torch.distributed as dist +from torch import Tensor + +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision + +if float(torchvision.__version__.split(".")[1]) < 7.0: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float16, device='npu') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("npu") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="npu") + size_list = [torch.tensor([0], device="npu") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.int8, device="npu")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.int8, device="npu") + tensor = torch.cat((tensor, padding), dim=0) + + dist.all_gather(tensor_list, tensor.char()) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, batch_size, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.npu.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}', + 'FPS:{fps:.4f}' + ]) + else: + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + fps = 1 / (float(str(iter_time)) / batch_size) + if torch.npu.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.npu.max_memory_allocated() / MB, + fps=fps)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def get_sha(): + cwd = os.path.dirname(os.path.abspath(__file__)) + + def _run(command): + return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() + + sha = 'N/A' + diff = "clean" + branch = 'N/A' + try: + sha = _run(['git', 'rev-parse', 'HEAD']) + subprocess.check_output(['git', 'diff'], cwd=cwd) + diff = _run(['git', 'diff-index', 'HEAD']) + diff = "has uncommited changes" if diff else "clean" + branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) + except Exception: + pass + message = f"sha: {sha}, 
status: {diff}, branch: {branch}" + return message + + +def collate_fn(batch): + batch = list(zip(*batch)) + batch[0] = nested_tensor_from_tensor_list(batch[0]) + return tuple(batch) + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + + # batch_shape = [len(tensor_list)] + [3,640,640] + # h,w=tensor_list[0].shape[::-2] + # pad_value = int(abs(h - w) / 2) + # if h > w: + # padded_image = torch.nn.functional.pad(tensor_list[0], (pad_value, pad_value, 0, 0)) + # else: + # padded_image = torch.nn.functional.pad(tensor_list[0], (0, 0, pad_value, pad_value)) + # padded_image=padded_image.resize_((3,640,640)) + # tensor_list=[padded_image] + # + + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + # print(pad_img.shape) + # print(img.shape) + # p + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], :img.shape[2]] = False + else: + raise ValueError('not supported') + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
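To make the padding behaviour of nested_tensor_from_tensor_list above concrete, here is a small sketch; the image sizes are invented purely for illustration, and it assumes the helpers defined in this file are in scope. Two images of different spatial sizes are zero-padded to the per-axis maximum, and the boolean mask is True exactly where a position is padding:

    import torch
    # two dummy images with different heights/widths (shapes are illustrative only)
    imgs = [torch.rand(3, 480, 640), torch.rand(3, 512, 600)]
    nt = nested_tensor_from_tensor_list(imgs)
    tensors, mask = nt.decompose()
    print(tensors.shape)  # torch.Size([2, 3, 512, 640]) -- padded to the max height/width
    print(mask.shape)     # torch.Size([2, 512, 640]); True marks padded positions

In DETR this mask is what ultimately feeds the key_padding_mask arguments of the attention layers seen earlier in transformer.py. The ONNX-traceable variant referred to in the comment above follows next.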
+@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + # max_size = [] + # for i in range(tensor_list[0].dim()): + # max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) + # max_size.append(max_size_i) + # max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + # padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padding = [torch.tensor(0), torch.tensor(0), torch.tensor(0)] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.npu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.npu = args.rank % torch.npu.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.npu.set_device(args.npu) + args.dist_backend = 'hccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +@torch.no_grad() +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + if target.numel() == 0: + return [torch.zeros([], device=output.device)] + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ 
+ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. + """ + if float(torchvision.__version__.split(".")[1]) < 7.0: + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0f24bed0d3fe4624aeb231ddd02633f2e58e4bff --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py @@ -0,0 +1,107 @@ +""" +Plotting utilities to visualize training logs. +""" +import torch +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt + +from pathlib import Path, PurePath + + +def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): + ''' + Function to plot specific fields from training log(s). Plots both training and test results. + + :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file + - fields = which results to plot from each log file - plots both training and test for each field. + - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots + - log_name = optional, name of log file if different than default 'log.txt'. + + :: Outputs - matplotlib plots of results in fields, color coded for each log file. + - solid lines are training results, dashed lines are test results. + + ''' + func_name = "plot_utils.py::plot_logs" + + # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, + # convert single Path to list to avoid 'not iterable' error + + if not isinstance(logs, list): + if isinstance(logs, PurePath): + logs = [logs] + print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") + else: + raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ + Expect list[Path] or single Path obj, received {type(logs)}") + + # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir + for i, dir in enumerate(logs): + if not isinstance(dir, PurePath): + raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") + if not dir.exists(): + raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") + # verify log_name exists + fn = Path(dir / log_name) + if not fn.exists(): + print(f"-> missing {log_name}. 
Have you gotten to Epoch 1 in training?") + print(f"--> full path of missing log file: {fn}") + return + + # load log file(s) and plot + dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] + + fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) + + for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): + for j, field in enumerate(fields): + if field == 'mAP': + coco_eval = pd.DataFrame( + np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1] + ).ewm(com=ewm_col).mean() + axs[j].plot(coco_eval, c=color) + else: + df.interpolate().ewm(com=ewm_col).mean().plot( + y=[f'train_{field}', f'test_{field}'], + ax=axs[j], + color=[color] * 2, + style=['-', '--'] + ) + for ax, field in zip(axs, fields): + ax.legend([Path(p).name for p in logs]) + ax.set_title(field) + + +def plot_precision_recall(files, naming_scheme='iter'): + if naming_scheme == 'exp_id': + # name becomes exp_id + names = [f.parts[-3] for f in files] + elif naming_scheme == 'iter': + names = [f.stem for f in files] + else: + raise ValueError(f'not supported {naming_scheme}') + fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) + for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): + data = torch.load(f) + # precision is n_iou, n_points, n_cat, n_area, max_det + precision = data['precision'] + recall = data['params'].recThrs + scores = data['scores'] + # take precision for all classes, all areas and 100 detections + precision = precision[0, :, :, 0, -1].mean(1) + scores = scores[0, :, :, 0, -1].mean(1) + prec = precision.mean() + rec = data['recall'][0, :, 0, -1].mean() + print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + + f'score={scores.mean():0.3f}, ' + + f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' + ) + axs[0].plot(recall, precision, c=color) + axs[1].plot(recall, scores, c=color) + + axs[0].set_title('Precision / Recall') + axs[0].legend(names) + axs[1].set_title('Scores / Recall') + axs[1].legend(names) + return fig, axs diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..a64e1b2400152648bcf7091a59ecb48dd8a11601 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py @@ -0,0 +1,261 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import argparse +import datetime +import os +import random +import time +from pathlib import Path +import numpy as np +import torch +from torch.utils.data import DataLoader, DistributedSampler +from datasets.coco_eval import CocoEvaluator +import util.misc as utils +from datasets import build_dataset, get_coco_api_from_dataset +from models import build_model +from util import box_ops + +def get_args_parser(): + parser = argparse.ArgumentParser('Set transformer detector', add_help=False) + parser.add_argument('--lr', default=1e-4, type=float) + parser.add_argument('--lr_backbone', default=1e-5, type=float) + parser.add_argument('--batch_size', default=1, type=int) + parser.add_argument('--weight_decay', default=1e-4, type=float) + parser.add_argument('--epochs', default=300, type=int) + parser.add_argument('--lr_drop', default=200, type=int) + parser.add_argument('--clip_max_norm', default=0.1, type=float, + help='gradient clipping max norm') + + # Model parameters + parser.add_argument('--frozen_weights', type=str, default=None, + help="Path to the pretrained model. 
If set, only the mask head will be trained") + # * Backbone + parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") + parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") + parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + # * Transformer + parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") + parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") + parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") + parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") + parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") + parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") + parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") + parser.add_argument('--pre_norm', action='store_true') + + # * Segmentation + parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") + + # Loss + parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', + help="Disables auxiliary decoding losses (loss at each layer)") + # * Matcher + parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") + parser.add_argument('--set_cost_bbox', default=5, type=float, + help="L1 box coefficient in the matching cost") + parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") + # * Loss coefficients + parser.add_argument('--mask_loss_coef', default=1, type=float) + parser.add_argument('--dice_loss_coef', default=1, type=float) + parser.add_argument('--bbox_loss_coef', default=5, type=float) + parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + + # dataset parameters + parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--coco_path', type=str,default='/home/xu/SJH/datasets/coco') + parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--remove_difficult', action='store_true') + + parser.add_argument('--output_dir', default='output', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + parser.add_argument('--resume', default='model_file/detr.pth', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true',default=True,) + parser.add_argument('--num_workers', default=2, type=int) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed 
training') + return parser + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for file,(samples, targets) in zip(os.listdir('/home/xu/xiaoxiong/effdet/coco_data/val2017'),metric_logger.log_every(data_loader, 10, header)): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + print(samples.tensors.shape) + print(os.path.join('/home/xu/xiaoxiong/DETR/detr_bin','{}.bin'.format(file.split('.')[0]))) + samples.tensors.cpu().numpy().tofile(os.path.join('/home/xu/xiaoxiong/DETR/detr_bin','{}.bin'.format(file.split('.')[0]))) + + # ort_inputs = {ort_session.get_inputs()[0].name:samples.tensors.cpu().numpy()} + # # print('inputs',ort_inputs) + # ort_outs = ort_session.run(None, ort_inputs) + # out={'pred_logits':torch.from_numpy(ort_outs[0]).cuda(), + # 'pred_boxes':torch.from_numpy(ort_outs[1]).cuda()} + # outputs=out + # loss_dict=criterion(out,targets) + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + results = postprocessors['bbox'](outputs, orig_target_sizes) + + # print(len(results[0]['scores']),results[0]['scores']) + # print(results[0]['boxes']) + # print(results[0]['labels']) + # print(postprocessors.keys()) + + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + + # if 2592 in res.keys(): + # print(orig_target_sizes) + # print(res[2592]) + # for i,value in enumerate(res[2592]['scores']): + # if value>0.5: + # print(i,value) + # print(res[2592]['boxes'][i]) + # p + coco_evaluator.update(res) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + + return stats, coco_evaluator + +def main(args): + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + + if args.frozen_weights is not None: + assert args.masks, "Frozen training is meant for segmentation only" + print(args) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + model, criterion, postprocessors = 
build_model(args) + model.to(device) + + model_without_ddp = model + print(args.distributed) + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + param_dicts = [ + {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, + { + "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], + "lr": args.lr_backbone, + }, + ] + optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, + weight_decay=args.weight_decay) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) + + + dataset_val = build_dataset(image_set='val', args=args) + + if args.distributed: + sampler_val = DistributedSampler(dataset_val, shuffle=False) + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + + + base_ds = get_coco_api_from_dataset(dataset_val) + + if args.frozen_weights is not None: + checkpoint = torch.load(args.frozen_weights, map_location='cpu') + model_without_ddp.detr.load_state_dict(checkpoint['model']) + + output_dir = Path(args.output_dir) + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + start_time = time.time() + if args.eval: + print('start validate') + test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, + data_loader_val, base_ds, device, args.output_dir) + if args.output_dir: + utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") + return + + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..360861ede17fb0ab697fbcac190acde7c1e29fef --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM ${FROM_IMAGE_NAME} + +COPY requirements.txt . 
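+# Illustrative build command (the base image name is an assumption; pass whichever
+# Ascend-PyTorch base image is available in your environment):
+#   docker build --build-arg FROM_IMAGE_NAME=<ascend-pytorch-base-image> -t transformer-xl:latest .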
+RUN pip3.7 install -r requirements.txt
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..9575325dc67228acfddd34d75da5926015248460
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md
@@ -0,0 +1,62 @@
+# Transformer-xl
+
+This implements training of Transformer-xl on the enwik8 dataset, mainly modified from [kimiyoung/transformer-xl](https://github.com/kimiyoung/transformer-xl/tree/master/pytorch).
+
+## Transformer-xl Detail
+
+As of the current date, Ascend-PyTorch is still inefficient for contiguous operations. Therefore, Transformer-xl is re-implemented with alternative semantics such as custom OPs.
+
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+
+## Data Preparation
+- `bash getdata.sh`
+
+## Training and Evaluation
+
+To train or evaluate a model, run the scripts under `test/` with the path to the enwik8 dataset:
+
+
+```bash
+#env
+cd transformer-xl
+dos2unix ./test/*.sh
+
+# 1p train perf
+bash test/train_performance_1p.sh --data_path=xxxx
+
+# 8p train perf
+bash test/train_performance_8p.sh --data_path=xxxx
+
+# 8p train full
+bash test/train_full_8p.sh --data_path=xxxx
+
+# 1p eval
+bash test/train_eval_1p.sh --data_path=xxxx --pth_path=xxxx
+
+```
+
+- Parameter description:
+```bash
+#--data //dataset path, can be changed to the directory of your own dataset
+#--restart_dir //path of the checkpoint to load, can be changed to your own model file
+#--addr //host address
+#--max_step //maximum number of training steps
+#--batch-size //training batch size
+#--lr //initial learning rate, default: 0.00025
+#--device-list //devices used for multi-card training, e.g. 8 cards: '0,1,2,3,4,5,6,7'
+#--amp //whether to use mixed precision
+#--loss-scale //loss scale value
+#--opt-level //mixed precision level
+```
+
+
+## Transformer-xl training result

+| bpc | FPS | Npu_nums | Epochs | AMP_Type |
+| :------: | :------: | :------: | :------: | :------: |
+| - | 8300 | 1 | 1 | O2 |
+| 1.09 | 44500 | 8 | 50 | O2 |
\ No newline at end of file
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3642f37ceff7853d0e1eecb85c1232eb5ead5bd0
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py
@@ -0,0 +1,258 @@
+import os, sys
+import glob
+import numpy as np
+import torch
+
+from utils.vocabulary import Vocab
+
+
+class LMOrderedIterator(object):
+    def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None):
+        """
+            data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
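+        # For example, with a stream of 100 tokens and bsz=4: n_step is 25 and
+        # self.data below has shape [25, 4]; column b then holds tokens
+        # 25*b .. 25*b+24 as one contiguous stream.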
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + return data, target, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data = data.to(self.device) + target = target.to(self.device) + + yield data, target, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='npu:0', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return 
sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class Corpus(object): + def __init__(self, path, dataset, *args, **kwargs): + self.dataset = dataset + self.vocab = Vocab(*args, **kwargs) + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test.py']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + if os.path.exists(fn): + print('Loading cached dataset...') + corpus = torch.load(fn) + else: + print('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + corpus = Corpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + return corpus + + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py new file mode 100644 index 
0000000000000000000000000000000000000000..2cc3b6a73c7957fd2ff286ed41ef199263108385 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py @@ -0,0 +1,378 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import math +import os +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from apex import amp +import apex +from utils.exp_utils import get_logger + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', 
type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=10, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--pth', type=str, default='', + help='eval checkpoint') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 
0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +parser.add_argument('--no_log', action='store_true', + help='do not log the eval result') +parser.add_argument('--split', default='valid', + choices=['all','valid','test']) + +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + +# Get logger +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) +logging = get_logger('log.txt', log_=not args.no_log) + +loc = "npu:0" +torch.npu.set_device(loc) + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +va_iter = corpus.get_iterator('valid', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test.py', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in 
range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) +model.apply(weights_init) +model.word_emb.apply(weights_init) +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + +model = model.to(loc) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + + +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +# Load the best saved model. 
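+# The checkpoint given by --pth is deserialized directly onto the NPU device
+# selected above (map_location=loc) before its state_dict is loaded into the model.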
+with open(args.pth, 'rb') as f: + model.load_state_dict(torch.load(f, map_location=loc)) + +logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + +model.reset_length(args.tgt_len, args.ext_len, args.mem_len) +if args.clamp_len > 0: + model.clamp_len = args.clamp_len +if args.same_length: + model.same_length = True + +############################################################################### +# Evaluation code +############################################################################### + +def evaluate(eval_iter): + model.eval() + total_len, total_loss = 0, 0. + start_time = time.time() + with torch.no_grad(): + mems = tuple() + for idx, (data, target, seq_len) in enumerate(eval_iter): + ts = time.time() + ret = model(data,target,*mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.item() + total_len += seq_len + #print('eval_batch id: {} use time: {:.2f} ms '.format(idx, (time.time()-ts)*1000)) + total_time = time.time() - start_time + logging('Time : {:.2f}s, FPS: {:.2f} characters/s'.format( + total_time, total_len*args.batch_size*args.eval_tgt_len/total_time)) + return total_loss / total_len + + +# Run on test.py data. +if args.split == 'all': + test_loss = evaluate(te_iter) + valid_loss = evaluate(va_iter) +elif args.split == 'valid': + valid_loss = evaluate(va_iter) + test_loss = None +elif args.split == 'test': + test_loss = evaluate(te_iter) + valid_loss = None + +def format_log(loss, split): + if args.dataset in ['enwik8', 'text8']: + log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( + split, loss, loss / math.log(2)) + else: + log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( + split, loss, math.exp(loss)) + return log_str + +log_str = '' +if valid_loss is not None: + log_str += format_log(valid_loss, 'valid') +if test_loss is not None: + log_str += format_log(test_loss, 'test.py') + +logging('=' * 100) +logging(log_str) +logging('=' * 100) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh new file mode 100644 index 0000000000000000000000000000000000000000..8043d1a1aa96cc492928aed06d34a0c4ed6f0f6c --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh @@ -0,0 +1,18 @@ +echo "=== Acquiring datasets ===" +echo "---" + +mkdir -p data +cd data + +echo "- Downloading enwik8 (Character)" +if [[ ! -d 'enwik8' ]]; then + mkdir -p enwik8 + cd enwik8 + wget --continue http://mattmahoney.net/dc/enwik8.zip + wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py + python3 prep_enwik8.py + cd .. 
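+    # prep_enwik8.py is expected to leave train.txt, valid.txt and test.txt under
+    # data/enwik8, which is the layout that Corpus in data_utils.py reads.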
+fi + +echo "---" +echo "Happy language modeling :)" diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..92a55122f9fec99dc3a32cd6c38fcc9ca008625a --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py @@ -0,0 +1,851 @@ +import sys +import math +import functools + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append('utils') +from utils.proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax +from utils.log_uniform_sampler import LogUniformSampler, sample_logits + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + # output = self.layer_norm((inp + core_out).squeeze()) + # output = output.unsqueeze(1) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and 
attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) + + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:]) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + 
w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + attn_mask_bool = attn_mask.bool() + if attn_mask is not None and attn_mask_bool.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None, :, :, None], -float('inf')).bool().type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:, :, :, None], -float('inf')).bool().type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ 
= torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + print("n_token:", n_token) + print("d_embed:", d_embed) + # self.emb_layers.append( + # nn.Embedding(n_token, 512, sparse=sample_softmax>0) + # ) + 
self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed = emb_flat.view(*inp.size(), self.d_proj) + + embed.mul_(self.emb_scale) + + return embed + +class MemTransformerLM(nn.Module): + def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner, + dropout, dropatt, tie_weight=True, d_embed=None, + div_val=1, tie_projs=[False], pre_lnorm=False, + tgt_len=None, ext_len=None, mem_len=None, + cutoffs=[], adapt_inp=False, + same_length=False, attn_type=0, clamp_len=-1, + sample_softmax=-1): + super(MemTransformerLM, self).__init__() + self.n_token = n_token + + d_embed = d_model if d_embed is None else d_embed + self.d_embed = d_embed + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, + div_val=div_val) + self.drop = nn.Dropout(dropout) + + self.n_layer = n_layer + + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + self.max_klen = tgt_len + ext_len + mem_len + + self.attn_type = attn_type + + self.layers = nn.ModuleList() + if attn_type == 0: # the default attention + for i in range(n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type == 1: # learnable embeddings + for i in range(n_layer): + self.layers.append( + RelLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type in [2, 3]: # absolute embeddings + for i in range(n_layer): + self.layers.append( + DecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + self.sample_softmax = sample_softmax + # use sampled softmax + if sample_softmax > 0: + self.out_layer = nn.Linear(d_model, n_token) + if tie_weight: + self.out_layer.weight = self.word_emb.weight + self.tie_weight = tie_weight + self.sampler = LogUniformSampler(n_token, sample_softmax) + + # use adaptive softmax (including standard softmax) + else: + # dump_tensor(n_token, 'n_token.pt') + # dump_tensor(d_embed, 'd_embed.pt') + # dump_tensor(d_model, 'd_model.pt') + # dump_tensor(cutoffs, 'cutoffs.pt') + # dump_tensor(div_val, 'div_val.pt') + + self.crit = 
ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, + cutoffs, div_val=div_val) + + if tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight + + if tie_projs: + for i, tie_proj in enumerate(tie_projs): + if tie_proj and div_val == 1 and d_model != d_embed: + self.crit.out_projs[i] = self.word_emb.emb_projs[0] + elif tie_proj and div_val != 1: + self.crit.out_projs[i] = self.word_emb.emb_projs[i] + + self.same_length = same_length + self.clamp_len = clamp_len + + self._create_params() + + def backward_compatible(self): + self.sample_softmax = -1 + + def _create_params(self): + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer+1): + empty = torch.empty(0, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. 
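+        # Worked example of the caching window computed below (a sketch using the
+        # values from the unit test at the bottom of this file: tgt_len = mem_len = 36,
+        # ext_len = 0, so qlen = mlen = 36 once the memory is full):
+        #     end_idx = mlen + max(0, qlen - 0 - self.ext_len) = 36 + 36 = 72
+        #     beg_idx = max(0, end_idx - self.mem_len)         = 72 - 36 = 36
+        # Of the mlen + qlen = 72 concatenated hidden states, only the most recent
+        # mem_len = 36 are kept as the memory for the next forward pass.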
+ with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + word_emb = self.word_emb(dec_inp.long()) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, self.r_w_bias, + self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + hids.append(core_out) + for i, layer in enumerate(self.layers): + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, data, target, *mems): + # nn.DataParallel does not allow size(0) tensors to be broadcasted. + # So, have to initialize size(0) mems inside the model forward. + # Moreover, have to return new_mems to allow nn.DataParallel to piece + # them together. 
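+        # Minimal usage sketch of this interface, mirroring the __main__ test below
+        # and the train() loop in train_1p_npu.py: the first call passes no memories,
+        # and each later call feeds back the memories returned by the previous one.
+        #
+        #     mems = tuple()
+        #     for data, target, seq_len in train_iter:
+        #         ret = model(data, target, *mems)
+        #         loss, mems = ret[0], ret[1:]
+        #
+        # Returning new_mems together with the loss is what lets nn.DataParallel
+        # gather the per-replica memories described above.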
+ if not mems: mems = self.init_mems() + + tgt_len = target.size(0) + hidden, new_mems = self._forward(data, mems=mems) + + pred_hid = hidden[-tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.tie_weight + logit = sample_logits(self.word_emb, + self.out_layer.bias, target, pred_hid, self.sampler) + loss = -F.log_softmax(logit, -1)[:, :, 0] + else: + loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1)) + loss = loss.view(tgt_len, -1) + loss = loss.npu() + + if new_mems is None: + return [loss] + else: + return [loss] + new_mems + +def set_device(obj, device='cpu'): + if isinstance(obj, (tuple, list)): + dump = [] + for item in obj: + dump.append(set_device(item, device)) + return dump + elif isinstance(obj, dict): + dump = {} + for k, v in obj.items(): + dump[k] = set_device(v, device) + return dump + elif isinstance(obj, torch.Tensor): + return obj.to(device) + else: + return obj + + +def dump_tensor(output, name): + dump = set_device(output, 'cpu') + torch.save(dump, name) + print('%s dump success!' % (name)) + + +def load_tensor(name, device): + output = torch.load(name) + dump = set_device(output, device) + print('%s load success!' % (name)) + return dump + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='unit test') + + parser.add_argument('--n_layer', type=int, default=4, help='') + parser.add_argument('--n_rel_layer', type=int, default=4, help='') + parser.add_argument('--n_head', type=int, default=2, help='') + parser.add_argument('--d_head', type=int, default=2, help='') + parser.add_argument('--d_model', type=int, default=200, help='') + parser.add_argument('--d_embed', type=int, default=200, help='') + parser.add_argument('--d_inner', type=int, default=200, help='') + parser.add_argument('--dropout', type=float, default=0.0, help='') + parser.add_argument('--cuda', action='store_true', help='') + parser.add_argument('--seed', type=int, default=1111, help='') + parser.add_argument('--multi_gpu', action='store_true', help='') + + args = parser.parse_args() + + #device = torch.device("cuda" if args.cuda else "cpu") + device = torch.device("npu:0") + + B = 4 + tgt_len, mem_len, ext_len = 36, 36, 0 + data_len = tgt_len * 20 + args.n_token = 10000 + + import data_utils + + data = torch.LongTensor(data_len*B).random_(0, args.n_token).to(device) + diter = data_utils.LMOrderedIterator(data, B, tgt_len, device=device, ext_len=ext_len) + + cutoffs = [args.n_token // 2] + tie_projs = [False] + [True] * len(cutoffs) + + for div_val in [1, 2]: + for d_embed in [200, 100]: + model = MemTransformerLM(args.n_token, args.n_layer, args.n_head, + args.d_model, args.d_head, args.d_inner, args.dropout, + dropatt=args.dropout, tie_weight=True, + d_embed=d_embed, div_val=div_val, + tie_projs=tie_projs, pre_lnorm=True, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + cutoffs=cutoffs, attn_type=0).to(device) + + print(sum(p.numel() for p in model.parameters())) + + mems = tuple() + for idx, (inp, tgt, seqlen) in enumerate(diter): + print('batch {}'.format(idx)) + out = model(inp, tgt, *mems) + mems = out[1:] diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f29898a44e5414055c4a4dbb4f0998260f9809 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:POK +PrecisionStatus:POK \ 
No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f4eaeb22a91cbb52bc225de73f755272ad3fe53 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt @@ -0,0 +1,5 @@ +torchvision +tqdm +numpy +itertools +argparse \ No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..280fca96da61c2d983f9a690fd0c044bbcba9167 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export 
OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Host侧Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +#设置Device侧日志等级为error +${install_path}/driver/tools/msnpureport -g error +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable +export BMMV2_ENABLE=1 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/gcc7.3.0/lib64:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..9b97a2636da9102d40e867388137a1c15d22a4c0 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=50 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3 -u eval_npu.py --split valid \ + --data=${data_path} \ + --pth=${pth_path} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Eval' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#最后一个迭代loss值,不需要修改 +#ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/Eval_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f77b282e9aa79914f9737735ce722d58f8710262 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=50 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` 
+cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --data=${data_path} \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --max_step=400000 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $20}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Eval' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep epoch ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..18157a65d0efb238c9b696aab00bd74edf5bc672 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3.7 -u ./train_1p_npu.py \ + --data=${data_path} \ + --seed=1111 \ + --workers=${workers} \ + --gpu=${ASCEND_DEVICE_ID} \ + --eval-interval=4000 \ + --log-interval=1 \ + --max_step=100 \ + --epochs=${train_epochs} \ + --static-loss-scale=128 \ + --batch_size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +#FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#FPS=${FPS#* } + +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` + +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000}'` + +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log + + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1840ab3cb057206e47ce43487720cf7311434e4 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=50 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --data=${data_path} \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --log-interval=1 \ + --eval-interval=4000 \ + --max_step=100 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log|awk '{a+=$1} END {if (NR !=0) printf("%.3f", a/NR)}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk 'END 
{print}'|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'` + +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..8f213a8237389c9d4942c7bc2ce1d172923850cb --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py @@ -0,0 +1,542 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import time +import math +import os +import itertools + +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import apex + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--epochs', type=int, default=50, + help='train epochs') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, 
default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--workers', type=int, default=64, + help='workers num') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. 
If supplied, this argument' + ' supersedes --static-loss-scale.') +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + +device = torch.device('npu:0') + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +eval_batch_size = 10 +tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def 
update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) + model.apply(update_dropout) + model.apply(update_dropatt) +else: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + +if args.multi_gpu: + model = model.to(device) + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(device) + else: + para_model = nn.DataParallel(model, dim=1).to(device) +else: + para_model = model.to(device) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + +################################################################################################### +opt_level = "O2" +model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) +################################################################################################### + + +#### scheduler +if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) +elif args.scheduler == 'inv_sqrt': + + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + +elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) +elif args.scheduler == 'constant': + pass + + +if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + +logging('=' * 100) +for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +############################################################################### +# Training code +############################################################################### + +def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + + return total_loss / total_len + + +def train(): + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + #################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + #################################################################### + with torch.no_grad(): + train_loss += loss.float().bool().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + #################################################### + with torch.no_grad(): + train_loss += loss.float().item() + ################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # 
step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len/elapsed) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + ts = time.time() + val_loss = evaluate(va_iter) + print('evaluation use time {} s'.format(time.time()-ts)) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - eval_start_time), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + break + +# Loop over epochs. +train_step = 0 +train_loss = 0 +best_val_loss = None + +log_start_time = time.time() +eval_start_time = time.time() + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + break +except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + +## Load the best saved model. +#with open('model.pt', 'rb') as f: +# model.load_state_dict(torch.load(f, map_location=device)) +#para_model = model.to(device) + +## Run on test data. 
+#test_loss = evaluate(te_iter) +#logging('=' * 100) +#if args.dataset in ['enwik8', 'text8']: +# logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( +# test_loss, test_loss / math.log(2))) +#else: +# logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( +# test_loss, math.exp(test_loss))) +#logging('=' * 100) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..fb782e84f1777212f930e0741f409b6e376da96b --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py @@ -0,0 +1,644 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import math +import os, sys +import itertools +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import torch.distributed as dist +import apex +import warnings + + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', 
default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=1000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +# parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=1000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 
0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +#edit this for 8p +parser.add_argument('--dist-backend', type=str, default='hccl') +parser.add_argument('--world-size', type=int, default=-1) +parser.add_argument('--rank', type=int, default=-1) +parser.add_argument('--local_rank', type=int, default=0) +parser.add_argument('--addr', type=str, default='127.0.0.1') +parser.add_argument('--device_num', type=int, default=-1) +parser.add_argument('--workers', type=int, default=32) +parser.add_argument('--device-list', default='', type=str) +parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000') +parser.add_argument('--device', type=str, default='npu') +parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +warnings.filterwarnings('ignore') +#############end################# + +def main(): + args = parser.parse_args() + args.tied = not args.not_tied + torch.manual_seed(args.seed) + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + ############################## + # edit this for 8p + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29888' + os.environ['LOCAL_DEVICE_ID'] = str(0) + print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + if args.device_list != '': + ngpus_per_node = len(args.device_list.split(',')) + elif args.device_num != -1: + ngpus_per_node = args.device_num + elif args.device == 'npu': + ngpus_per_node = int(os.environ["RANK_SIZE"]) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + args.world_size = ngpus_per_node * args.world_size + if args.device == 'npu': + main_worker(args.local_rank, ngpus_per_node,args) + else: + main_worker(args.gpu, ngpus_per_node, args) + ############################## + + +def main_worker(gpu, ngpus_per_node, args): + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + if args.d_embed < 0: + args.d_embed = args.d_model + + assert args.ext_len >= 0, 'extended context length must be non-negative' + assert args.batch_size % args.batch_chunk == 0 + + args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) + args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + + if args.device_list != '': + args.gpu = int(args.device_list.split(',')[gpu]) + else: + args.gpu = gpu + + print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + os.environ['LOCAL_DEVICE_ID'] = str(args.gpu) + print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + + if args.gpu is not None: + print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + print("[npu id:", args.gpu, "]", args) + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + + + ############################################################################### + # Load data + ############################################################################### + corpus = get_lm_corpus(args.data, args.dataset) + ntokens = len(corpus.vocab) + args.n_token = ntokens + + eval_batch_size = 10 + tr_iter = 
corpus.get_iterator('train', args.batch_size, args.tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+
+    # adaptive softmax / embedding
+    cutoffs, tie_projs = [], [False]
+    if args.adaptive:
+        assert args.dataset in ['wt103', 'lm1b']
+        if args.dataset == 'wt103':
+            cutoffs = [20000, 40000, 200000]
+            tie_projs += [True] * len(cutoffs)
+        elif args.dataset == 'lm1b':
+            cutoffs = [60000, 100000, 640000]
+            tie_projs += [False] * len(cutoffs)
+
+    ###############################################################################
+    # Build the model
+    ###############################################################################
+    def init_weight(weight):
+        if args.init == 'uniform':
+            nn.init.uniform_(weight, -args.init_range, args.init_range)
+        elif args.init == 'normal':
+            nn.init.normal_(weight, 0.0, args.init_std)
+
+    def init_bias(bias):
+        nn.init.constant_(bias, 0.0)
+
+    def weights_init(m):
+        classname = m.__class__.__name__
+        if classname.find('Linear') != -1:
+            if hasattr(m, 'weight') and m.weight is not None:
+                init_weight(m.weight)
+            if hasattr(m, 'bias') and m.bias is not None:
+                init_bias(m.bias)
+        elif classname.find('AdaptiveEmbedding') != -1:
+            if hasattr(m, 'emb_projs'):
+                for i in range(len(m.emb_projs)):
+                    if m.emb_projs[i] is not None:
+                        nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std)
+        elif classname.find('Embedding') != -1:
+            if hasattr(m, 'weight'):
+                init_weight(m.weight)
+        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
+            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
+                init_weight(m.cluster_weight)
+            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
+                init_bias(m.cluster_bias)
+            if hasattr(m, 'out_projs'):
+                for i in range(len(m.out_projs)):
+                    if m.out_projs[i] is not None:
+                        nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std)
+        elif classname.find('LayerNorm') != -1:
+            if hasattr(m, 'weight'):
+                nn.init.normal_(m.weight, 1.0, args.init_std)
+            if hasattr(m, 'bias') and m.bias is not None:
+                init_bias(m.bias)
+        elif classname.find('TransformerLM') != -1:
+            if hasattr(m, 'r_emb'):
+                init_weight(m.r_emb)
+            if hasattr(m, 'r_w_bias'):
+                init_weight(m.r_w_bias)
+            if hasattr(m, 'r_r_bias'):
+                init_weight(m.r_r_bias)
+            if hasattr(m, 'r_bias'):
+                init_bias(m.r_bias)
+
+    def update_dropout(m):
+        classname = m.__class__.__name__
+        if classname.find('Dropout') != -1:
+            if hasattr(m, 'p'):
+                m.p = args.dropout
+
+    def update_dropatt(m):
+        if hasattr(m, 'dropatt'):
+            m.dropatt.p = args.dropatt
+
+    if args.restart:
+        with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
+            model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model,
+                args.d_head, args.d_inner, args.dropout, args.dropatt,
+                tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
+                tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
+                ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
+                same_length=args.same_length, attn_type=args.attn_type,
+                clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
+            model.apply(weights_init)
+            model.word_emb.apply(weights_init)
+            model = model.to(loc)
+            ckpt = torch.load(f, map_location=loc)
+            model.load_state_dict(ckpt)
+            model.apply(update_dropout)
+            model.apply(update_dropatt)
+    else:
+        model = MemTransformerLM(ntokens, args.n_layer,
args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing + + + + args.n_all_param = sum([p.nelement() for p in model.parameters()]) + args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + + + #### optimizer + if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) + elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + #optimizer = optim.Adam(model.parameters(), lr=args.lr) + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) + elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + model = model.to(loc) + ################################################################################################### + opt_level = "O2" + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) + ################################################################################################### + + if args.multi_gpu: + + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(loc) + else: + para_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + else: + para_model = model.to(loc) + + #### scheduler + if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) + elif args.scheduler == 'inv_sqrt': + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + elif args.scheduler == 'constant': + pass + + + if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f, map_location=loc) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + + logging('=' * 100) + for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) + logging('=' * 100) + logging('#params = {}'.format(args.n_all_param)) + logging('#non emb params = {}'.format(args.n_nonemb_param)) + + ############################################################################### + # Training code + ############################################################################### + + def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + return total_loss / total_len + + + def train(): + # Turn on training mode which enables dropout. 
+        global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
+
+        model.train()
+        if args.batch_chunk > 1:
+            mems = [tuple() for _ in range(args.batch_chunk)]
+        else:
+            mems = tuple()
+        train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
+        for batch, (data, target, seq_len) in enumerate(train_iter):
+            model.zero_grad()
+            if args.batch_chunk > 1:
+                data_chunks = torch.chunk(data, args.batch_chunk, 1)
+                target_chunks = torch.chunk(target, args.batch_chunk, 1)
+                for i in range(args.batch_chunk):
+                    data_i = data_chunks[i].contiguous()
+                    target_i = target_chunks[i].contiguous()
+                    ret = para_model(data_i, target_i, *mems[i])
+                    loss, mems[i] = ret[0], ret[1:]
+                    loss = loss.float().mean().type_as(loss) / args.batch_chunk
+                    ####################################################################
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
+                    ####################################################################
+                    with torch.no_grad():
+                        train_loss += loss.float().item()
+            else:
+                ret = para_model(data, target, *mems)
+                loss, mems = ret[0], ret[1:]
+                loss = loss.float().mean().type_as(loss)
+                ####################################################
+                with torch.no_grad():
+                    train_loss += loss.float().item()
+                ###################################################################
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+
+
+            optimizer.step()
+            if args.sample_softmax > 0:
+                optimizer_sparse.step()
+
+            # step-wise learning rate annealing
+            train_step += 1
+            if args.scheduler in ['cosine', 'constant', 'dev_perf']:
+                # linear warmup stage
+                if train_step < args.warmup_step:
+                    curr_lr = args.lr * train_step / args.warmup_step
+                    optimizer.param_groups[0]['lr'] = curr_lr
+                    if args.sample_softmax > 0:
+                        optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
+                else:
+                    if args.scheduler == 'cosine':
+                        scheduler.step(train_step)
+                        if args.sample_softmax > 0:
+                            scheduler_sparse.step(train_step)
+            elif args.scheduler == 'inv_sqrt':
+                scheduler.step(train_step)
+
+            if train_step % args.log_interval == 0:
+                cur_loss = train_loss / args.log_interval
+                elapsed = time.time() - log_start_time
+                log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
+                          '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format(
+                    epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
+                    elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len*8/elapsed)
+                if args.dataset in ['enwik8', 'text8']:
+                    log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
+                else:
+                    log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
+                logging(log_str)
+                train_loss = 0
+                log_start_time = time.time()
+
+            if train_step % args.eval_interval == 0:
+                ts = time.time()
+                val_loss = evaluate(va_iter)
+                print('evaluation use time {} s'.format(time.time()-ts))
+                logging('-' * 100)
+                log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
+                          '| valid loss {:5.2f}'.format(
+                    train_step // args.eval_interval, train_step,
+                    (time.time() - ts), val_loss)
+                if args.dataset in ['enwik8', 'text8']:
+                    log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
+                else:
+                    log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
+                logging(log_str)
+                logging('-' * 100)
+                # Save the model if the validation loss is the best we've seen so far.
+ if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + sys.exit() + + # At any point you can hit Ctrl + C to break out of training early. + try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + sys.exit() + except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + + # # Load the best saved model. + # with open('model.pt', 'rb') as f: + # model.load_state_dict(torch.load(f, map_location=loc)) + # para_model = model.to(loc) + + # # Run on test data. + # test_loss = evaluate(te_iter) + # logging('=' * 100) + # if args.dataset in ['enwik8', 'text8']: + # logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( + # test_loss, test_loss / math.log(2))) + # else: + # logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( + # test_loss, math.exp(test_loss))) + # logging('=' * 100) + + +if __name__ == '__main__': + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + train_step = 0 + train_loss = 0 + best_val_loss = None + log_start_time = time.time() + eval_start_time = time.time() + main() diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..3c54a69204525d62466eb58245af2a3165798bed --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py @@ -0,0 +1,102 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
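+# Adaptive softmax (Grave et al., "Efficient softmax approximation for GPUs"):
+# the vocabulary is split into a frequent-token shortlist plus tail clusters, so
+# the full output distribution is computed over the head only, and tail tokens
+# are scored within their own, much smaller, cluster.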
+import torch +import torch.nn as nn +import torch.nn.functional as F + +class AdaptiveLogSoftmax(nn.Module): + def __init__(self, in_features, n_classes, cutoffs, keep_order=False): + super(AdaptiveLogSoftmax, self).__init__() + + cutoffs = list(cutoffs) + + if (cutoffs != sorted(cutoffs)) \ + or (min(cutoffs) <= 0) \ + or (max(cutoffs) >= (n_classes - 1)) \ + or (len(set(cutoffs)) != len(cutoffs)) \ + or any([int(c) != c for c in cutoffs]): + + raise ValueError("cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1") + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.keep_order = keep_order + + + def forward(self, hidden, target, weight, bias, keep_order=False): + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + head_weight = torch.cat( + [weight[:self.shortlist_size], self.cluster_weight], dim=0) + head_bias = torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + print(f'target_i[:,None]: {target_i[:, None]}') + print(f'target_i[:,None].shape: {target_i[:, None].shape}') + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..1b48aaaa644e8310cce7cb4d04d14b9d832d39ff --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py @@ -0,0 +1,109 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from torch.nn.parallel import DataParallel +import torch +from torch.nn.parallel._functions import Scatter +from torch.nn.parallel.parallel_apply import parallel_apply + +def scatter(inputs, target_gpus, chunk_sizes, dim=0): + r""" + Slices tensors into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not tensors. + """ + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + try: + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + except: + print('obj', obj.size()) + print('dim', dim) + print('chunk_sizes', chunk_sizes) + quit() + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + try: + return scatter_map(inputs) + finally: + scatter_map = None + +def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs + +class BalancedDataParallel(DataParallel): + def __init__(self, gpu0_bsz, *args, **kwargs): + self.gpu0_bsz = gpu0_bsz + super().__init__(*args, **kwargs) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + if self.gpu0_bsz == 0: + device_ids = self.device_ids[1:] + else: + device_ids = self.device_ids + inputs, kwargs = self.scatter(inputs, kwargs, device_ids) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids) + if self.gpu0_bsz == 0: + replicas = replicas[1:] + outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) + + ######################################3 + # outputs=outputs.to('cpu') + # self.output_device='cpu' + print(f'outputs: {outputs}') + print(f'type(outputs): {type(outputs)}') + print(f'len(outputs): {len(outputs)}') + print(f'self.output_device: {self.output_device}') + + + return self.gather(outputs, self.output_device) + + def parallel_apply(self, replicas, device_ids, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, device_ids) + + def scatter(self, inputs, kwargs, device_ids): + bsz = inputs[0].size(self.dim) + num_dev = len(self.device_ids) + gpu0_bsz = self.gpu0_bsz + bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) + if gpu0_bsz < bsz_unit: + chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) + delta = bsz - sum(chunk_sizes) + for i in range(delta): + chunk_sizes[i + 1] += 1 + if gpu0_bsz == 0: + chunk_sizes = chunk_sizes[1:] + else: + return 
super().scatter(inputs, kwargs, device_ids) + return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f290b8e70eed7448095e9af4b97341d0e89644eb --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py @@ -0,0 +1,40 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import os +import shutil +import torch + + +def logging(s, log_path, print_=True, log_=True): + if print_: + print(s) + if log_: + with open(log_path, 'a+') as f_log: + f_log.write(s + '\n') + + +def get_logger(log_path, **kwargs): + return functools.partial(logging, log_path=log_path, **kwargs) + + +def create_exp_dir(dir_path, scripts_to_save=None, debug=False): + print('Experiment dir : {}'.format(dir_path)) + return get_logger(log_path='log.txt') + + +def save_checkpoint(model, optimizer, path, epoch): + torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) + torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..4ebe1297479dd63b01cfcc2d553760fff26947b0 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py @@ -0,0 +1,111 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
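+# Log-uniform (Zipf-like) negative sampler and the sampled-softmax logits helper,
+# intended for the sampled-softmax path (--sample_softmax > 0).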
+import torch
+from torch import nn
+
+class LogUniformSampler(object):
+    def __init__(self, range_max, n_sample):
+        """
+        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
+        `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
+
+        expected count can be approximated by 1 - (1 - p)^n
+        and we use a numerically stable version -expm1(num_tries * log1p(-p))
+
+        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
+        """
+        with torch.no_grad():
+            self.range_max = range_max
+            log_indices = torch.arange(1., range_max+2., 1.).log_()
+            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+            # print('P', self.dist.numpy().tolist()[-30:])
+
+            self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
+
+        self.n_sample = n_sample
+
+    def sample(self, labels):
+        """
+            labels: [b1, b2]
+        Return
+            true_log_probs: [b1, b2]
+            samp_log_probs: [n_sample]
+            neg_samples: [n_sample]
+        """
+
+        n_sample = self.n_sample
+        n_tries = 2 * n_sample
+
+        with torch.no_grad():
+            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
+            device = labels.device
+            neg_samples = neg_samples.to(device)
+            true_log_probs = self.log_q[labels].to(device)
+            samp_log_probs = self.log_q[neg_samples].to(device)
+            return true_log_probs, samp_log_probs, neg_samples
+
+def sample_logits(embedding, bias, labels, inputs, sampler):
+    """
+        embedding: an nn.Embedding layer
+        bias: [n_vocab]
+        labels: [b1, b2]
+        inputs: [b1, b2, n_emb]
+        sampler: you may use a LogUniformSampler
+    Return
+        logits: [b1, b2, 1 + n_sample]
+    """
+    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
+    n_sample = neg_samples.size(0)
+    b1, b2 = labels.size(0), labels.size(1)
+    all_ids = torch.cat([labels.view(-1), neg_samples])
+    all_w = embedding(all_ids)
+    true_w = all_w[: -n_sample].view(b1, b2, -1)
+    sample_w = all_w[- n_sample:].view(n_sample, -1)
+
+    all_b = bias[all_ids]
+    true_b = all_b[: -n_sample].view(b1, b2)
+    sample_b = all_b[- n_sample:]
+
+    hit = (labels[:, :, None] == neg_samples).detach()
+
+    true_logits = torch.einsum('ijk,ijk->ij',
+        [true_w, inputs]) + true_b - true_log_probs
+    sample_logits = torch.einsum('lk,ijk->ijl',
+        [sample_w, inputs]) + sample_b - samp_log_probs
+    sample_logits.masked_fill_(hit, -1e30)
+    logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
+
+    return logits
+
+
+if __name__ == '__main__':
+    S, B = 3, 4
+    n_vocab = 10000
+    n_sample = 5
+    H = 32
+
+    labels = torch.LongTensor(S, B).random_(0, n_vocab)
+    # LogUniformSampler takes (range_max, n_sample)
+    sampler = LogUniformSampler(n_vocab, n_sample)
+
+    embedding = nn.Embedding(n_vocab, H)
+    bias = torch.zeros(n_vocab)
+    inputs = torch.Tensor(S, B, H).normal_()
+
+    # sample_logits returns a single [S, B, 1 + n_sample] logits tensor
+    logits = sample_logits(embedding, bias, labels, inputs, sampler)
+    print('logits', logits.detach().numpy().tolist())
+    print('logits shape', logits.size())
+
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..886757190ce2b308dc3e3d6e66762f623cb9878a
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py
@@ -0,0 +1,160 @@
+# coding: UTF-8
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+####################################################################################
+# edit
+# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])  # CUDA major version
+# CUDA_MINOR = int(torch.version.cuda.split('.')[1])  # CUDA minor version
+#####################################################################################
+
+class ProjectedAdaptiveLogSoftmax(nn.Module):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+                 keep_order=False):
+        super(ProjectedAdaptiveLogSoftmax, self).__init__()
+        self.n_token = n_token
+        self.d_embed = d_embed
+        self.d_proj = d_proj
+
+        self.cutoffs = cutoffs + [n_token]
+        self.cutoff_ends = [0] + self.cutoffs
+        self.div_val = div_val
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+
+        if self.n_clusters > 0:
+            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
+            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+        self.out_layers = nn.ModuleList()
+        self.out_projs = nn.ParameterList()
+
+        if div_val == 1:
+            for i in range(len(self.cutoffs)):
+                if d_proj != d_embed:
+                    self.out_projs.append(
+                        nn.Parameter(torch.Tensor(d_proj, d_embed))
+                    )
+                else:
+                    self.out_projs.append(None)
+
+            self.out_layers.append(nn.Linear(d_embed, n_token))
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                d_emb_i = d_embed // (div_val ** i)
+
+                self.out_projs.append(
+                    nn.Parameter(torch.Tensor(d_proj, d_emb_i))
+                )
+
+                self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
+
+        self.keep_order = keep_order
+
+    def _compute_logit(self, hidden, weight, bias, proj):
+        if proj is None:
+            logit = F.linear(hidden, weight, bias=bias)
+        else:
+            proj_hid = F.linear(hidden, proj.t().contiguous())
+            logit = F.linear(proj_hid, weight, bias=bias)
+
+        return logit
+
+    def forward(self, hidden, target, keep_order=False):
+        '''
+            hidden :: [len*bsz x d_proj]
+            target :: [len*bsz]
+        '''
+
+        if hidden.size(0) != target.size(0):
+            raise RuntimeError('Input and target should have the same size '
+                               'in the batch dimension.')
+
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+
+            nll = -F.log_softmax(logit, dim=-1).gather(1, target.unsqueeze(1).long()).squeeze(1)
+
+        else:
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat(
+                        [weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat(
+                        [bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
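+
+            # The head softmax covers the shortlist tokens plus one "cluster token"
+            # per tail cluster; a tail word's log-prob is assembled below as
+            # log P(cluster | h) + log P(word | cluster, h).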
+ + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py new file mode 100644 index 0000000000000000000000000000000000000000..a13e4183c243befab6f7167719931782f67c843c --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py @@ -0,0 +1,178 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
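+# Vocab maintains the symbol<->index mapping (built from corpus counts or a fixed
+# vocab file) and encodes tokenized text into LongTensor id sequences.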
+import os
+from collections import Counter, OrderedDict
+import torch
+
+
+class Vocab(object):
+    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
+                 delimiter=None, vocab_file=None):
+        self.counter = Counter()
+        self.special = special
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        if self.lower_case:
+            line = line.lower()
+
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos: # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose: print('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+
+    def count_sents(self, sents, verbose=False):
+        """
+            sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: print('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        self.unk_idx = self.sym2idx['<UNK>']
+
+    def build_vocab(self):
+        if self.vocab_file:
+            print('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            print('final vocab size {}'.format(len(self)))
+        else:
+            print('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            print('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+                    add_double_eos=False):
+        if verbose: print('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                    add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            assert '<eos>' not in sym
+            assert hasattr(self, 'unk_idx')
+            return self.sym2idx.get(sym, self.unk_idx)
+
+    def get_symbols(self, indices):
+        return [self.get_sym(idx) for idx in indices]
+
+    def get_indices(self, symbols):
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.get_indices(symbols))
+
+    def convert_to_sent(self, indices, exclude=None):
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)